tgc_all <- read_csv("../TGC_data_281022.csv")
## Rows: 13000 Columns: 105
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (31): TGC_sangerlane, Data Accession, Study Accession, Sample Accession,...
## dbl (66): CORE MATCHES, % CORE FAMILIES, % NON-CORE, GENOME LENGTH, N50, NO....
## lgl (8): MDR, cipNS, cipR, XDR, aziR, untargetedSurveillance, assumedAcute,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# total genomes
nrow(tgc_all)
## [1] 13000
# genomes excluded due to size
table(tgc_all$exclude_assembly)
##
## FALSE TRUE
## 12965 35
tgc_all <- tgc_all %>%
mutate(IncHI1=grepl("IncHI1",`Inc Types`))
table(tgc_all$IncHI1, tgc_all$exclude_assembly)
##
## FALSE TRUE
## FALSE 12007 32
## TRUE 958 3
genome_sizes <- tgc_all %>%
ggplot( aes(y=`GENOME LENGTH`, x=IncHI1)) +
geom_violin() +
theme_bw() +
geom_hline(yintercept = 4.5E6, linetype="dashed", color="blue") +
geom_hline(yintercept = 5.5E6, linetype="dashed", color="blue") +
labs(x="IncHI1 detected", y="Genome length (bp)", subtitle="a) All genomes, pre-filter") +
scale_x_discrete(labels=c(paste0("yes (n=",sum(tgc_all$IncHI1),")"), paste0("no (n=",sum(!tgc_all$IncHI1),")")))
genome_sizes
genome_sizes_included <- tgc_all %>%
filter(exclude_assembly != TRUE) %>%
ggplot( aes(y=`GENOME LENGTH`, x=IncHI1)) +
geom_violin() +
theme_bw() +
labs(x="IncHI1 detected", y="Genome length (bp)", subtitle="b) Included genomes") +
scale_x_discrete(labels=c(paste0("yes (n=",sum(tgc_all$IncHI1[!tgc_all$exclude_assembly]),")"), paste0("no (n=",sum(!tgc_all$IncHI1[!tgc_all$exclude_assembly]),")")))
genome_sizes_included
pdf(paste0("SuppFig1_genomeSize_by_inc.pdf"), width=6, height=4)
genome_sizes | genome_sizes_included
dev.off()
## quartz_off_screen
## 2
png(paste0("SuppFig1_genomeSize_by_inc.png"), width=6, height=4, units="in", res=400)
genome_sizes | genome_sizes_included
dev.off()
## quartz_off_screen
## 2
tgc_all <- tgc_all %>% filter(exclude_assembly != TRUE)
# total high quality genomes
nrow(tgc_all)
## [1] 12965
# number of countries
length(table(tgc_all$Country_Origin))
## [1] 110
# assumed acute cases
table(tgc_all$assumedAcute)
##
## FALSE TRUE
## 134 12831
# non-targeted
tgc_all %>% filter(startsWith(`Purpose of Sampling`, "Non")) %>% summarise(n=n())
## # A tibble: 1 × 1
## n
## <int>
## 1 11086
# targeted
tgc_all %>% filter(startsWith(`Purpose of Sampling`, "Targeted")) %>% summarise(n=n())
## # A tibble: 1 × 1
## n
## <int>
## 1 1862
# purpose not provided
tgc_all %>% filter(`Purpose of Sampling` == "Not Provided") %>% summarise(n=n())
## # A tibble: 1 × 1
## n
## <int>
## 1 17
source_vs_health <- table(tgc_all$Source,tgc_all$`Host Health State`)
source_vs_health <- rbind(source_vs_health, Total=colSums(source_vs_health))
source_vs_health <- source_vs_health[c("Blood", "Stool", "Urine", "Gallbladder", "Environment", "Other", "Not Provided", "Total"),c("Symptomatic", "Asymptomatic Carrier", "Not Provided")]
write.csv(source_vs_health, file=paste0("SuppTable3_health_vs_source_excludeFails.csv"))
region_summary <- tgc_all %>%
group_by(Region)%>%
summarize(region_total =n(),
post2010_rep =sum(Year >= 2010 & assumedAcute & untargetedSurveillance),
travel_post2010=sum(`Travel Associated`=="Yes" & Year >= 2010 & assumedAcute & untargetedSurveillance),
) %>%
adorn_totals("row") %>%
mutate(travel_post2010_pc = paste0(travel_post2010, " (",
round(travel_post2010/post2010_rep*100,1), "%)")
) %>%
select(Region, region_total, post2010_rep, travel_post2010_pc) %>%
rename(`Total genomes`=region_total, `Representative cases from 2010`=post2010_rep, `Travel (%) amongst representative cases from 2010`=travel_post2010_pc) %>%
mutate(Region=replace(Region, is.na(Region), "Unknown"))
region_summary
## Region Total genomes Representative cases from 2010
## Australia and New Zealand 57 57
## Caribbean 20 20
## Central America 103 100
## Eastern Africa 1106 830
## Eastern Asia 12 3
## Eastern Europe 3 1
## Melanesia 232 37
## Micronesia 4 1
## Middle Africa 59 21
## Northern Africa 41 6
## Northern America 167 140
## Northern Europe 109 105
## Polynesia 324 262
## South America 367 105
## South-eastern Asia 1140 584
## Southern Africa 317 286
## Southern Asia 8231 6623
## Southern Europe 10 6
## Western Africa 384 267
## Western Asia 47 21
## Western Europe 7 3
## Unknown 225 0
## Total 12965 9478
## Travel (%) amongst representative cases from 2010
## 0 (0%)
## 20 (100%)
## 100 (100%)
## 49 (5.9%)
## 3 (100%)
## 1 (100%)
## 30 (81.1%)
## 1 (100%)
## 6 (28.6%)
## 6 (100%)
## 2 (1.4%)
## 0 (0%)
## 45 (17.2%)
## 5 (4.8%)
## 72 (12.3%)
## 2 (0.7%)
## 1878 (28.4%)
## 6 (100%)
## 34 (12.7%)
## 21 (100%)
## 3 (100%)
## 0 (NaN%)
## 2284 (24.1%)
write.csv(region_summary, file=paste0("Table1_region_summary.csv"), row.names=F)
country_summary <- tgc_all %>%
group_by(Region, Country_Origin)%>%
summarize(n =n(),
travel = sum(`Travel Associated`=="Yes"),
untargeted_from2010 =sum(Year >= 2010 & assumedAcute & untargetedSurveillance),
travel_untargeted_from2010 = sum(`Travel Associated`=="Yes" & Year >= 2010 & assumedAcute & untargetedSurveillance),
) %>%
adorn_totals("row") %>%
mutate(travel_pc = paste0(travel, " (", round(travel/n*100,1), "%)")) %>%
mutate(travel_untargeted_from2010_pc = paste0(travel_untargeted_from2010, " (", round(travel_untargeted_from2010/untargeted_from2010*100,1), "%)")) %>%
select(Region, Country_Origin, n, travel_pc, untargeted_from2010, travel_untargeted_from2010_pc) %>%
rename(`Total genomes`=n,
`Travel (%)`=travel_pc,
`Representative cases from 2010`=untargeted_from2010,
`Travel (%) amongst representative cases from 2010`=travel_untargeted_from2010_pc) %>%
mutate(Region=replace(Region, is.na(Region), "Unknown"))
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
write.csv(country_summary, file=paste0("SuppTable4_country_summary.csv"), row.names=F)
country_count_by_region <- table(paste0(tgc_all$Region, ": ", tgc_all$Country_Origin))
sum(country_count_by_region>=20)
## [1] 36
sum(country_count_by_region[country_count_by_region>=20])-225 # subtract the 'unknown' category
## [1] 12410
(sum(country_count_by_region[country_count_by_region>=20])-225)/nrow(tgc_all)
## [1] 0.9571924
sum(country_count_by_region>=100)
## [1] 21
sum(country_count_by_region[country_count_by_region>=100])-225 # subtract the 'unknown' category
## [1] 11762
(sum(country_count_by_region[country_count_by_region>=100])-225)/nrow(tgc_all)
## [1] 0.9072117
table(tgc_all$`Travel Associated`)
##
## No Not Provided Yes
## 4736 4848 3381
# labs contirbuting travel-associated isolates
tgc_all %>% group_by(`Isolating Lab`, `Travel Associated`) %>% summarise (n=n()) %>% filter(`Travel Associated`=="Yes")
## `summarise()` has grouped output by 'Isolating Lab'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 3
## # Groups: Isolating Lab [14]
## `Isolating Lab` `Travel Associated` n
## <chr> <chr> <int>
## 1 CDC Yes 749
## 2 ESR Yes 144
## 3 IDS Yes 12
## 4 IPA Yes 116
## 5 KDC Yes 8
## 6 MDU Yes 490
## 7 NCD Yes 13
## 8 NDJ Yes 104
## 9 NIP Yes 1
## 10 PHE Yes 1740
## 11 TDC Yes 1
## 12 UBS Yes 1
## 13 USF Yes 1
## 14 WHP Yes 1
# country of origin for travel-associated isolates
tgc_all %>% group_by(Country_Origin, `Travel Associated`) %>% summarise (n=n()) %>% filter(`Travel Associated`=="Yes") %>% filter(n>35)
## `summarise()` has grouped output by 'Country_Origin'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 3
## # Groups: Country_Origin [12]
## Country_Origin `Travel Associated` n
## <chr> <chr> <int>
## 1 Bangladesh Yes 264
## 2 Chile Yes 49
## 3 Fiji Yes 102
## 4 India Yes 1241
## 5 Indonesia Yes 39
## 6 Mexico Yes 60
## 7 Nepal Yes 39
## 8 Nigeria Yes 42
## 9 Not Provided Yes 193
## 10 Pakistan Yes 783
## 11 Papua New Guinea Yes 45
## 12 Samoa Yes 87
# surveillance-ready samples only
tgc_meta <- tgc_all %>%
filter(assumedAcute == T,
untargetedSurveillance == T,
Country_Origin != "Not Provided")
nrow(tgc_meta)
## [1] 10726
# surveillance-ready samples, known years, 2010 onwards
tgc_from2010 <- tgc_all %>%
filter(assumedAcute == T,
untargetedSurveillance == T,
Country_Origin != "Not Provided",
Year >=2010)
nrow(tgc_from2010)
## [1] 9478
nrow(tgc_from2010)/nrow(tgc_meta)
## [1] 0.8836472
region_N_from2010 <- tgc_from2010 %>%
group_by(Region)%>%
summarize(N =n())
tgc_from2010_regionFreq_allGeno <- tgc_from2010 %>%
group_by(Region, Final_genotype)%>%
summarize(x =n()) %>%
left_join(region_N_from2010) %>% # region counts
mutate(p = x/N) %>% # proportion per region
mutate(se = sqrt(p*(1-p)/N)) %>%
mutate(p_lowerCI = p-1.96*se, p_upperCI = p+1.96*se) %>%
mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
select(-se)
## `summarise()` has grouped output by 'Region'. You can override using the `.groups` argument.
## Joining, by = "Region"
# annual counts/freqs
region_N_annual <- tgc_from2010 %>%
group_by(Region, Year)%>%
summarize(N =n())
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
tgc_from2010_regionFreq_allGeno_annual <- tgc_from2010 %>%
group_by(Region, Final_genotype, Year)%>%
summarize(x =n()) %>%
left_join(region_N_annual) %>% # regional annual
mutate(p = x/N) %>% # proportion per region
mutate(se = sqrt(p*(1-p)/N)) %>%
mutate(p_lowerCI = p-1.96*se, p_upperCI = p+1.96*se) %>%
mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
select(-se)
## `summarise()` has grouped output by 'Region', 'Final_genotype'. You can override using the `.groups` argument.
## Joining, by = c("Region", "Year")
# merge annual counts with total counts into one table for printing
genotypes_by_region <- tgc_from2010_regionFreq_allGeno %>%
mutate(Year="all") %>%
bind_rows(tgc_from2010_regionFreq_allGeno_annual) %>%
arrange(Region, Year, Final_genotype) %>%
rename(Genotype=Final_genotype) %>%
relocate(Year, .after=Genotype)
write.csv(genotypes_by_region, file=paste0("SuppTable5_genotypeFreqs_byRegion_2010-2020.csv"), row.names=F)
country_N_from2010 <- tgc_from2010 %>%
group_by(Country_Origin)%>%
summarize(N =n())
tgc_from2010_countryFreq_allGeno <- tgc_from2010 %>%
group_by(Region, Country_Origin, Final_genotype)%>%
summarize(x =n()) %>%
left_join(country_N_from2010) %>% # country counts
mutate(p = x/N) %>% # proportion per region
mutate(se = sqrt(p*(1-p)/N)) %>%
mutate(p_lowerCI = p-1.96*se,
p_upperCI = p+1.96*se) %>%
mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
select(-se)
## `summarise()` has grouped output by 'Region', 'Country_Origin'. You can override using the `.groups` argument.
## Joining, by = "Country_Origin"
# annual counts/freqs
country_N_annual <- tgc_from2010 %>%
group_by(Country_Origin, Year)%>%
summarize(N =n())
## `summarise()` has grouped output by 'Country_Origin'. You can override using the
## `.groups` argument.
tgc_from2010_countryFreq_allGeno_annual <- tgc_from2010 %>%
group_by(Region, Country_Origin, Final_genotype, Year)%>%
summarize(x =n()) %>%
left_join(country_N_annual) %>% # country annual
mutate(p = x/N) %>% # proportion per country
mutate(se = sqrt(p*(1-p)/N)) %>%
mutate(p_lowerCI = p-1.96*se, p_upperCI = p+1.96*se) %>%
mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
select(-se)
## `summarise()` has grouped output by 'Region', 'Country_Origin', 'Final_genotype'. You can override using the `.groups` argument.
## Joining, by = c("Country_Origin", "Year")
# merge annual counts with total counts into one table for printing
genotypes_by_country <- tgc_from2010_countryFreq_allGeno %>%
mutate(Year="all") %>%
bind_rows(tgc_from2010_countryFreq_allGeno_annual) %>%
arrange(Region, Country_Origin, Year, Final_genotype) %>%
rename(Genotype=Final_genotype) %>%
relocate(Year, .after=Genotype)
write.csv(genotypes_by_country, file=paste0("SuppTable6_genotypeFreqs_byCountry_2010-2020.csv"), row.names=F)
# H58 regional stats
tgc_from2010_regionFreq_h58 <- tgc_from2010 %>%
mutate(h58 = startsWith(Final_genotype,"4.3.1")) %>%
group_by(Region, h58)%>%
summarize(x =n()) %>%
left_join(region_N_from2010) %>% # country counts
mutate(p = x/N) %>% # proportion per region
mutate(se = sqrt(p*(1-p)/N)) %>%
mutate(p_lowerCI = p-1.96*se,
p_upperCI = p+1.96*se) %>%
mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
select(-se)
## `summarise()` has grouped output by 'Region'. You can override using the `.groups` argument.
## Joining, by = "Region"
tgc_from2010_regionFreq_h58
## # A tibble: 34 × 7
## # Groups: Region [21]
## Region h58 x N p p_lowerCI p_upperCI
## <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 Australia and New Zealand FALSE 51 57 0.895 0.815 0.974
## 2 Australia and New Zealand TRUE 6 57 0.105 0.0256 0.185
## 3 Caribbean FALSE 20 20 1 1 1
## 4 Central America FALSE 97 100 0.97 0.937 1
## 5 Central America TRUE 3 100 0.03 0 0.0634
## 6 Eastern Africa FALSE 56 830 0.0675 0.0504 0.0845
## 7 Eastern Africa TRUE 774 830 0.933 0.915 0.950
## 8 Eastern Asia FALSE 2 3 0.667 0.133 1
## 9 Eastern Asia TRUE 1 3 0.333 0 0.867
## 10 Eastern Europe FALSE 1 1 1 1 1
## # … with 24 more rows
# H58 lineage 1 regional stats
tgc_from2010_regionFreq_h58L1 <- tgc_from2010 %>%
mutate(h58L1 = startsWith(Final_genotype,"4.3.1.1")) %>%
group_by(Region, h58L1)%>%
summarize(x =n()) %>%
left_join(region_N_from2010) %>% # country counts
mutate(p = x/N) %>% # proportion per region
mutate(se = sqrt(p*(1-p)/N)) %>%
mutate(p_lowerCI = p-1.96*se,
p_upperCI = p+1.96*se) %>%
mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
select(-se)
## `summarise()` has grouped output by 'Region'. You can override using the `.groups` argument.
## Joining, by = "Region"
tgc_from2010_regionFreq_h58L1
## # A tibble: 31 × 7
## # Groups: Region [21]
## Region h58L1 x N p p_lowerCI p_upperCI
## <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 Australia and New Zealand FALSE 55 57 0.965 0.917 1
## 2 Australia and New Zealand TRUE 2 57 0.0351 0 0.0829
## 3 Caribbean FALSE 20 20 1 1 1
## 4 Central America FALSE 98 100 0.98 0.953 1
## 5 Central America TRUE 2 100 0.02 0 0.0474
## 6 Eastern Africa FALSE 179 830 0.216 0.188 0.244
## 7 Eastern Africa TRUE 651 830 0.784 0.756 0.812
## 8 Eastern Asia FALSE 3 3 1 1 1
## 9 Eastern Europe FALSE 1 1 1 1 1
## 10 Melanesia FALSE 37 37 1 1 1
## # … with 21 more rows
# H58 lineage 2 regional stats
tgc_from2010_regionFreq_h58L2 <- tgc_from2010 %>%
mutate(h58L2 = startsWith(Final_genotype,"4.3.1.2")) %>%
group_by(Region, h58L2)%>%
summarize(x =n()) %>%
left_join(region_N_from2010) %>% # country counts
mutate(p = x/N) %>% # proportion per region
mutate(se = sqrt(p*(1-p)/N)) %>%
mutate(p_lowerCI = p-1.96*se,
p_upperCI = p+1.96*se) %>%
mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
select(-se)
## `summarise()` has grouped output by 'Region'. You can override using the `.groups` argument.
## Joining, by = "Region"
tgc_from2010_regionFreq_h58L2
## # A tibble: 34 × 7
## # Groups: Region [21]
## Region h58L2 x N p p_lowerCI p_upperCI
## <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 Australia and New Zealand FALSE 53 57 0.930 0.864 0.996
## 2 Australia and New Zealand TRUE 4 57 0.0702 0.00386 0.136
## 3 Caribbean FALSE 20 20 1 1 1
## 4 Central America FALSE 99 100 0.99 0.970 1
## 5 Central America TRUE 1 100 0.01 0 0.0295
## 6 Eastern Africa FALSE 707 830 0.852 0.828 0.876
## 7 Eastern Africa TRUE 123 830 0.148 0.124 0.172
## 8 Eastern Asia FALSE 2 3 0.667 0.133 1
## 9 Eastern Asia TRUE 1 3 0.333 0 0.867
## 10 Eastern Europe FALSE 1 1 1 1 1
## # … with 24 more rows
# H58 country stats
tgc_from2010_countryFreq_h58 <- tgc_from2010 %>%
mutate(h58 = startsWith(Final_genotype,"4.3.1")) %>%
group_by(Region, Country_Origin, h58)%>%
summarize(x =n()) %>%
left_join(country_N_from2010) %>% # country counts
mutate(p = x/N) %>% # proportion per country
mutate(se = sqrt(p*(1-p)/N)) %>%
mutate(p_lowerCI = p-1.96*se,
p_upperCI = p+1.96*se) %>%
mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
select(-se)
## `summarise()` has grouped output by 'Region', 'Country_Origin'. You can override using the `.groups` argument.
## Joining, by = "Country_Origin"
tgc_from2010_countryFreq_h58
## # A tibble: 111 × 8
## # Groups: Region, Country_Origin [82]
## Region Country_Origin h58 x N p p_lowerCI p_upperCI
## <chr> <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 Australia and Ne… Australia FALSE 7 12 0.583 0.304 0.862
## 2 Australia and Ne… Australia TRUE 5 12 0.417 0.138 0.696
## 3 Australia and Ne… New Zealand FALSE 44 45 0.978 0.935 1
## 4 Australia and Ne… New Zealand TRUE 1 45 0.0222 0 0.0653
## 5 Caribbean Dominican Rep… FALSE 6 6 1 1 1
## 6 Caribbean Haiti FALSE 12 12 1 1 1
## 7 Caribbean Jamaica FALSE 1 1 1 1 1
## 8 Caribbean Turks and Cai… FALSE 1 1 1 1 1
## 9 Central America El Salvador FALSE 19 19 1 1 1
## 10 Central America Guatemala FALSE 22 22 1 1 1
## # … with 101 more rows
# H58 L1 country stats
tgc_from2010_countryFreq_h58L1 <- tgc_from2010 %>%
mutate(h58L1 = startsWith(Final_genotype,"4.3.1.1")) %>%
group_by(Region, Country_Origin, h58L1)%>%
summarize(x =n()) %>%
left_join(country_N_from2010) %>% # country counts
mutate(p = x/N) %>% # proportion per region
mutate(se = sqrt(p*(1-p)/N)) %>%
mutate(p_lowerCI = p-1.96*se,
p_upperCI = p+1.96*se) %>%
mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
select(-se)
## `summarise()` has grouped output by 'Region', 'Country_Origin'. You can override using the `.groups` argument.
## Joining, by = "Country_Origin"
tgc_from2010_countryFreq_h58L1
## # A tibble: 107 × 8
## # Groups: Region, Country_Origin [82]
## Region Country_Origin h58L1 x N p p_lowerCI p_upperCI
## <chr> <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 Australia and New… Australia FALSE 10 12 0.833 0.622 1
## 2 Australia and New… Australia TRUE 2 12 0.167 0 0.378
## 3 Australia and New… New Zealand FALSE 45 45 1 1 1
## 4 Caribbean Dominican Rep… FALSE 6 6 1 1 1
## 5 Caribbean Haiti FALSE 12 12 1 1 1
## 6 Caribbean Jamaica FALSE 1 1 1 1 1
## 7 Caribbean Turks and Cai… FALSE 1 1 1 1 1
## 8 Central America El Salvador FALSE 19 19 1 1 1
## 9 Central America Guatemala FALSE 22 22 1 1 1
## 10 Central America Mexico FALSE 57 58 0.983 0.949 1
## # … with 97 more rows
# H58 L2 country stats
tgc_from2010_countryFreq_h58L2 <- tgc_from2010 %>%
mutate(h58L2 = startsWith(Final_genotype,"4.3.1.2")) %>%
group_by(Region, Country_Origin, h58L2)%>%
summarize(x =n()) %>%
left_join(country_N_from2010) %>% # country counts
mutate(p = x/N) %>% # proportion per region
mutate(se = sqrt(p*(1-p)/N)) %>%
mutate(p_lowerCI = p-1.96*se,
p_upperCI = p+1.96*se) %>%
mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
select(-se)
## `summarise()` has grouped output by 'Region', 'Country_Origin'. You can override using the `.groups` argument.
## Joining, by = "Country_Origin"
tgc_from2010_countryFreq_h58L2
## # A tibble: 107 × 8
## # Groups: Region, Country_Origin [82]
## Region Country_Origin h58L2 x N p p_lowerCI p_upperCI
## <chr> <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 Australia and Ne… Australia FALSE 9 12 0.75 0.505 0.995
## 2 Australia and Ne… Australia TRUE 3 12 0.25 0.00500 0.495
## 3 Australia and Ne… New Zealand FALSE 44 45 0.978 0.935 1
## 4 Australia and Ne… New Zealand TRUE 1 45 0.0222 0 0.0653
## 5 Caribbean Dominican Rep… FALSE 6 6 1 1 1
## 6 Caribbean Haiti FALSE 12 12 1 1 1
## 7 Caribbean Jamaica FALSE 1 1 1 1 1
## 8 Caribbean Turks and Cai… FALSE 1 1 1 1 1
## 9 Central America El Salvador FALSE 19 19 1 1 1
## 10 Central America Guatemala FALSE 22 22 1 1 1
## # … with 97 more rows
# Southern Asia
tgc_from2010_regionFreq_h58 %>% filter(Region=="Southern Asia") %>% filter(h58)
## # A tibble: 1 × 7
## # Groups: Region [1]
## Region h58 x N p p_lowerCI p_upperCI
## <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 Southern Asia TRUE 4662 6623 0.704 0.693 0.715
tgc_from2010_countryFreq_h58 %>% filter(Region=="Southern Asia") %>% filter(h58)
## # A tibble: 6 × 8
## # Groups: Region, Country_Origin [6]
## Region Country_Origin h58 x N p p_lowerCI p_upperCI
## <chr> <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 Southern Asia Afghanistan TRUE 5 5 1 1 1
## 2 Southern Asia Bangladesh TRUE 670 1591 0.421 0.397 0.445
## 3 Southern Asia India TRUE 1655 2267 0.730 0.712 0.748
## 4 Southern Asia Nepal TRUE 941 1275 0.738 0.714 0.762
## 5 Southern Asia Pakistan TRUE 1390 1484 0.937 0.924 0.949
## 6 Southern Asia Sri Lanka TRUE 1 1 1 1 1
tgc_from2010_countryFreq_h58L1 %>% filter(Region=="Southern Asia") %>% filter(h58L1)
## # A tibble: 5 × 8
## # Groups: Region, Country_Origin [5]
## Region Country_Origin h58L1 x N p p_lowerCI p_upperCI
## <chr> <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 Southern Asia Afghanistan TRUE 5 5 1 1 1
## 2 Southern Asia Bangladesh TRUE 546 1591 0.343 0.320 0.367
## 3 Southern Asia India TRUE 268 2267 0.118 0.105 0.132
## 4 Southern Asia Nepal TRUE 63 1275 0.0494 0.0375 0.0613
## 5 Southern Asia Pakistan TRUE 1089 1484 0.734 0.711 0.756
tgc_from2010_countryFreq_h58L2 %>% filter(Region=="Southern Asia") %>% filter(h58L2)
## # A tibble: 4 × 8
## # Groups: Region, Country_Origin [4]
## Region Country_Origin h58L2 x N p p_lowerCI p_upperCI
## <chr> <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 Southern Asia Bangladesh TRUE 9 1591 0.00566 0.00197 0.00934
## 2 Southern Asia India TRUE 1214 2267 0.536 0.515 0.556
## 3 Southern Asia Nepal TRUE 726 1275 0.569 0.542 0.597
## 4 Southern Asia Pakistan TRUE 47 1484 0.0317 0.0228 0.0406
genotypes_by_country %>% filter(Region=="Southern Asia") %>% filter(Genotype=="4.3.1") %>% filter(Year=="all")
## # A tibble: 5 × 9
## # Groups: Region, Country_Origin [5]
## Region Country_Origin Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Souther… Bangladesh 4.3.1 all 2 1591 0.00126 0 0.00300
## 2 Souther… India 4.3.1 all 168 2267 0.0741 0.0633 0.0849
## 3 Souther… Nepal 4.3.1 all 152 1275 0.119 0.101 0.137
## 4 Souther… Pakistan 4.3.1 all 254 1484 0.171 0.152 0.190
## 5 Souther… Sri Lanka 4.3.1 all 1 1 1 1 1
genotypes_by_country %>% filter(Country_Origin=="Pakistan") %>% filter(Genotype=="4.3.1.1.P1")
## # A tibble: 6 × 9
## # Groups: Region, Country_Origin [1]
## Region Country_Origin Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Southern… Pakistan 4.3.1.1… 2016 1 96 0.0104 0 0.0307
## 2 Southern… Pakistan 4.3.1.1… 2017 58 337 0.172 0.132 0.212
## 3 Southern… Pakistan 4.3.1.1… 2018 422 678 0.622 0.586 0.659
## 4 Southern… Pakistan 4.3.1.1… 2019 86 244 0.352 0.293 0.412
## 5 Southern… Pakistan 4.3.1.1… 2020 27 31 0.871 0.753 0.989
## 6 Southern… Pakistan 4.3.1.1… all 594 1484 0.400 0.375 0.425
tgc_from2010_countryFreq_allGeno %>% filter(Region=="Southern Asia") %>% filter(p>0.05) %>% filter(!startsWith(Final_genotype, "4.3.1"))
## # A tibble: 7 × 8
## # Groups: Region, Country_Origin [3]
## Region Country_Origin Final_genotype x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Southern… Bangladesh 2.0.1 103 1591 0.0647 0.0526 0.0768
## 2 Southern… Bangladesh 2.3.3 274 1591 0.172 0.154 0.191
## 3 Southern… Bangladesh 3.2.2 264 1591 0.166 0.148 0.184
## 4 Southern… Bangladesh 3.3.2 93 1591 0.0585 0.0469 0.0700
## 5 Southern… India 2.5 190 2267 0.0838 0.0724 0.0952
## 6 Southern… India 3.3 150 2267 0.0662 0.0559 0.0764
## 7 Southern… Nepal 3.3.2 164 1275 0.129 0.110 0.147
# South east Asia
tgc_from2010_regionFreq_h58 %>% filter(Region=="South-eastern Asia") %>% filter(h58)
## # A tibble: 1 × 7
## # Groups: Region [1]
## Region h58 x N p p_lowerCI p_upperCI
## <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 South-eastern Asia TRUE 276 584 0.473 0.432 0.513
tgc_from2010_regionFreq_h58L1 %>% filter(Region=="South-eastern Asia") %>% filter(h58L1)
## # A tibble: 1 × 7
## # Groups: Region [1]
## Region h58L1 x N p p_lowerCI p_upperCI
## <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 South-eastern Asia TRUE 251 584 0.430 0.390 0.470
tgc_from2010_countryFreq_h58 %>% filter(Region=="South-eastern Asia") %>% filter(h58)
## # A tibble: 9 × 8
## # Groups: Region, Country_Origin [9]
## Region Country_Origin h58 x N p p_lowerCI p_upperCI
## <chr> <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 South-eastern As… Cambodia TRUE 216 221 0.977 0.958 0.997
## 2 South-eastern As… Indonesia TRUE 2 65 0.0308 0 0.0728
## 3 South-eastern As… Laos TRUE 1 27 0.0370 0 0.108
## 4 South-eastern As… Malaysia TRUE 1 3 0.333 0 0.867
## 5 South-eastern As… Myanmar TRUE 46 49 0.939 0.872 1
## 6 South-eastern As… Philippines TRUE 1 206 0.00485 0 0.0143
## 7 South-eastern As… Singapore TRUE 4 4 1 1 1
## 8 South-eastern As… Thailand TRUE 4 5 0.8 0.449 1
## 9 South-eastern As… Viet Nam TRUE 1 3 0.333 0 0.867
tgc_from2010_countryFreq_h58L1 %>% filter(Region=="South-eastern Asia") %>% filter(h58L1)
## # A tibble: 6 × 8
## # Groups: Region, Country_Origin [6]
## Region Country_Origin h58L1 x N p p_lowerCI p_upperCI
## <chr> <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 South-eastern Asia Cambodia TRUE 216 221 0.977 0.958 0.997
## 2 South-eastern Asia Laos TRUE 1 27 0.0370 0 0.108
## 3 South-eastern Asia Myanmar TRUE 29 49 0.592 0.454 0.729
## 4 South-eastern Asia Singapore TRUE 1 4 0.25 0 0.674
## 5 South-eastern Asia Thailand TRUE 3 5 0.6 0.171 1
## 6 South-eastern Asia Viet Nam TRUE 1 3 0.333 0 0.867
tgc_from2010_countryFreq_allGeno %>% filter(Region=="South-eastern Asia") %>% filter(p>0.05) %>% filter(!startsWith(Final_genotype, "4.3.1"))
## # A tibble: 18 × 8
## # Groups: Region, Country_Origin [7]
## Region Country_Origin Final_genotype x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 South-e… Indonesia 2.1 10 65 0.154 0.0661 0.242
## 2 South-e… Indonesia 3 12 65 0.185 0.0903 0.279
## 3 South-e… Indonesia 3.1.2 8 65 0.123 0.0432 0.203
## 4 South-e… Indonesia 4.1 17 65 0.262 0.155 0.368
## 5 South-e… Laos 2.3.4 3 27 0.111 0 0.230
## 6 South-e… Laos 3.2.1 3 27 0.111 0 0.230
## 7 South-e… Laos 3.4 12 27 0.444 0.257 0.632
## 8 South-e… Laos 3.5.2 4 27 0.148 0.0141 0.282
## 9 South-e… Laos 4.1 2 27 0.0741 0 0.173
## 10 South-e… Malaysia 2.2.1 1 3 0.333 0 0.867
## 11 South-e… Malaysia 3 1 3 0.333 0 0.867
## 12 South-e… Philippines 3 163 206 0.791 0.736 0.847
## 13 South-e… Philippines 3.2.1 23 206 0.112 0.0686 0.155
## 14 South-e… Philippines 4.1 16 206 0.0777 0.0411 0.114
## 15 South-e… Thailand 4.1 1 5 0.2 0 0.551
## 16 South-e… Timor-Leste 2.1 1 1 1 1 1
## 17 South-e… Viet Nam 2.1 1 3 0.333 0 0.867
## 18 South-e… Viet Nam 3.2.1 1 3 0.333 0 0.867
# Western Asia
genotypes_by_region %>% filter(Region=="Western Asia") %>% filter(Year=="all")
## # A tibble: 8 × 8
## # Groups: Region [1]
## Region Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Western Asia 0.1 all 1 21 0.0476 0 0.139
## 2 Western Asia 2 all 1 21 0.0476 0 0.139
## 3 Western Asia 2.2.1 all 1 21 0.0476 0 0.139
## 4 Western Asia 2.2.2 all 2 21 0.0952 0 0.221
## 5 Western Asia 3.3.1 all 1 21 0.0476 0 0.139
## 6 Western Asia 4.3.1 all 3 21 0.143 0 0.293
## 7 Western Asia 4.3.1.1 all 8 21 0.381 0.173 0.589
## 8 Western Asia 4.3.1.2 all 4 21 0.190 0.0225 0.358
genotypes_by_country %>% filter(Region=="Western Asia") %>% filter(Year=="all")
## # A tibble: 14 × 9
## # Groups: Region, Country_Origin [6]
## Region Country_Origin Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Western … Iraq 0.1 all 1 8 0.125 0 0.354
## 2 Western … Iraq 2 all 1 8 0.125 0 0.354
## 3 Western … Iraq 3.3.1 all 1 8 0.125 0 0.354
## 4 Western … Iraq 4.3.1.1 all 2 8 0.25 0 0.550
## 5 Western … Iraq 4.3.1.2 all 3 8 0.375 0.0395 0.710
## 6 Western … Lebanon 2.2.2 all 2 4 0.5 0.0100 0.99
## 7 Western … Lebanon 4.3.1.1 all 1 4 0.25 0 0.674
## 8 Western … Lebanon 4.3.1.2 all 1 4 0.25 0 0.674
## 9 Western … Qatar 4.3.1.1 all 1 1 1 1 1
## 10 Western … Saudi Arabia 4.3.1 all 2 3 0.667 0.133 1
## 11 Western … Saudi Arabia 4.3.1.1 all 1 3 0.333 0 0.867
## 12 Western … Syria 2.2.1 all 1 1 1 1 1
## 13 Western … United Arab E… 4.3.1 all 1 4 0.25 0 0.674
## 14 Western … United Arab E… 4.3.1.1 all 3 4 0.75 0.326 1
tgc_from2010_regionFreq_h58 %>% filter(Region=="Western Asia") %>% filter(h58)
## # A tibble: 1 × 7
## # Groups: Region [1]
## Region h58 x N p p_lowerCI p_upperCI
## <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 Western Asia TRUE 15 21 0.714 0.521 0.908
# Eastern Africa
tgc_from2010_regionFreq_h58 %>% filter(Region=="Eastern Africa") %>% filter(h58)
## # A tibble: 1 × 7
## # Groups: Region [1]
## Region h58 x N p p_lowerCI p_upperCI
## <chr> <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 Eastern Africa TRUE 774 830 0.933 0.915 0.950
genotypes_by_region %>% filter(Region=="Eastern Africa") %>% filter(Year=="all")
## # A tibble: 17 × 8
## # Groups: Region [1]
## Region Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Eastern Africa 1.2 all 1 830 0.00120 0 0.00356
## 2 Eastern Africa 2.2 all 6 830 0.00723 0.00147 0.0130
## 3 Eastern Africa 2.2.2 all 3 830 0.00361 0 0.00770
## 4 Eastern Africa 2.4.1 all 9 830 0.0108 0.00380 0.0179
## 5 Eastern Africa 2.5 all 4 830 0.00482 0.000108 0.00953
## 6 Eastern Africa 2.5.1 all 4 830 0.00482 0.000108 0.00953
## 7 Eastern Africa 2.5.2 all 4 830 0.00482 0.000108 0.00953
## 8 Eastern Africa 3 all 2 830 0.00241 0 0.00575
## 9 Eastern Africa 3.1 all 2 830 0.00241 0 0.00575
## 10 Eastern Africa 3.3.1 all 7 830 0.00843 0.00221 0.0147
## 11 Eastern Africa 4.1 all 1 830 0.00120 0 0.00356
## 12 Eastern Africa 4.1.1 all 13 830 0.0157 0.00722 0.0241
## 13 Eastern Africa 4.3.1.1 all 4 830 0.00482 0.000108 0.00953
## 14 Eastern Africa 4.3.1.1.EA1 all 647 830 0.780 0.751 0.808
## 15 Eastern Africa 4.3.1.2 all 2 830 0.00241 0 0.00575
## 16 Eastern Africa 4.3.1.2.EA2 all 46 830 0.0554 0.0399 0.0710
## 17 Eastern Africa 4.3.1.2.EA3 all 75 830 0.0904 0.0709 0.110
genotypes_by_country %>% filter(Region=="Eastern Africa") %>% filter(Year=="all") %>% filter(Genotype=="4.3.1.1.EA1")
## # A tibble: 6 × 9
## # Groups: Region, Country_Origin [6]
## Region Country_Origin Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Eastern … Kenya 4.3.1.1… all 86 149 0.577 0.498 0.657
## 2 Eastern … Malawi 4.3.1.1… all 524 558 0.939 0.919 0.959
## 3 Eastern … Mozambique 4.3.1.1… all 1 1 1 1 1
## 4 Eastern … Tanzania 4.3.1.1… all 15 18 0.833 0.661 1
## 5 Eastern … Uganda 4.3.1.1… all 1 36 0.0278 0 0.0815
## 6 Eastern … Zimbabwe 4.3.1.1… all 20 25 0.8 0.643 0.957
genotypes_by_country %>% filter(Region=="Eastern Africa") %>% filter(Year=="all") %>% filter(Genotype=="4.3.1.2.EA3")
## # A tibble: 4 × 9
## # Groups: Region, Country_Origin [4]
## Region Country_Origin Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Eastern A… Burundi 4.3.1.2… all 2 2 1 1 1
## 2 Eastern A… Kenya 4.3.1.2… all 15 149 0.101 0.0524 0.149
## 3 Eastern A… Rwanda 4.3.1.2… all 23 27 0.852 0.718 0.986
## 4 Eastern A… Uganda 4.3.1.2… all 35 36 0.972 0.919 1
genotypes_by_country %>% filter(Region=="Eastern Africa") %>% filter(Country_Origin=="Kenya") %>% filter(Year %in% c(2012, 2013, 2014, 2015, 2016)) %>% filter(Genotype=="4.3.1.1.EA1") %>% summarise(n=sum(x), N=sum(N), p=n/N)
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## # A tibble: 1 × 5
## # Groups: Region [1]
## Region Country_Origin n N p
## <chr> <chr> <int> <int> <dbl>
## 1 Eastern Africa Kenya 86 145 0.593
genotypes_by_country %>% filter(Region=="Eastern Africa") %>% filter(Country_Origin=="Kenya") %>% filter(Year %in% c(2017, 2018, 2019)) %>% filter(Genotype=="4.3.1.2.EA3") %>% summarise(n=sum(x), N=sum(N), p=n/N)
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## # A tibble: 1 × 5
## # Groups: Region [1]
## Region Country_Origin n N p
## <chr> <chr> <int> <int> <dbl>
## 1 Eastern Africa Kenya 4 4 1
# Southern Africa
tgc_from2010 %>% filter(Region=="Southern Africa") %>% group_by(Country_Origin) %>% summarise(n=n())
## # A tibble: 2 × 2
## Country_Origin n
## <chr> <int>
## 1 Namibia 1
## 2 South Africa 285
tgc_from2010 %>% filter(Region=="Southern Africa") %>% group_by(Country_Origin, Year) %>% summarise(n=n())
## `summarise()` has grouped output by 'Country_Origin'. You can override using the
## `.groups` argument.
## # A tibble: 9 × 3
## # Groups: Country_Origin [2]
## Country_Origin Year n
## <chr> <chr> <int>
## 1 Namibia 2017 1
## 2 South Africa 2010 5
## 3 South Africa 2011 3
## 4 South Africa 2012 8
## 5 South Africa 2016 7
## 6 South Africa 2017 76
## 7 South Africa 2018 61
## 8 South Africa 2019 81
## 9 South Africa 2020 44
tgc_from2010 %>% filter(Country_Origin=="South Africa") %>% filter(Year %in% c(2017, 2018, 2019, 2020)) %>% summarise(n=n())
## # A tibble: 1 × 1
## n
## <int>
## 1 262
tgc_from2010 %>% filter(Country_Origin=="South Africa") %>% filter(Year %in% c(2017, 2018, 2019, 2020)) %>% mutate(h58=startsWith(Final_genotype,"4.3.1")) %>% group_by(h58) %>% summarise(n=n()) %>% mutate(p=n/sum(n), N=sum(n))
## # A tibble: 2 × 4
## h58 n p N
## <lgl> <int> <dbl> <int>
## 1 FALSE 80 0.305 262
## 2 TRUE 182 0.695 262
tgc_from2010 %>% filter(Country_Origin=="South Africa") %>% filter(Year %in% c(2010, 2011, 2012)) %>% mutate(h58=startsWith(Final_genotype,"4.3.1")) %>% group_by(h58) %>% summarise(n=n()) %>% mutate(p=n/sum(n), N=sum(n))
## # A tibble: 2 × 4
## h58 n p N
## <lgl> <int> <dbl> <int>
## 1 FALSE 12 0.75 16
## 2 TRUE 4 0.25 16
tgc_from2010 %>% filter(Country_Origin=="South Africa") %>% filter(Year %in% c(2017, 2018, 2019, 2020)) %>% group_by(Final_genotype) %>% summarise(n=n()) %>% mutate(p=n/sum(n), N=sum(n))
## # A tibble: 18 × 4
## Final_genotype n p N
## <chr> <int> <dbl> <int>
## 1 1.1.2 3 0.0115 262
## 2 2.1 1 0.00382 262
## 3 2.2 4 0.0153 262
## 4 2.3.2 1 0.00382 262
## 5 2.4 20 0.0763 262
## 6 2.4.1 14 0.0534 262
## 7 2.5 11 0.0420 262
## 8 2.5.1 2 0.00763 262
## 9 3 1 0.00382 262
## 10 3.1 2 0.00763 262
## 11 3.1.1 1 0.00382 262
## 12 3.2.1 1 0.00382 262
## 13 3.3 1 0.00382 262
## 14 3.3.1 18 0.0687 262
## 15 4.3.1.1 9 0.0344 262
## 16 4.3.1.1.EA1 168 0.641 262
## 17 4.3.1.2 4 0.0153 262
## 18 4.3.1.2.EA3 1 0.00382 262
# Western Africa
genotypes_by_region %>% filter(Region=="Western Africa") %>% filter(Year=="all")
## # A tibble: 11 × 8
## # Groups: Region [1]
## Region Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Western Africa 0.0.1 all 2 267 0.00749 0 0.0178
## 2 Western Africa 0.0.3 all 7 267 0.0262 0.00705 0.0454
## 3 Western Africa 2.1 all 2 267 0.00749 0 0.0178
## 4 Western Africa 2.2 all 17 267 0.0637 0.0344 0.0930
## 5 Western Africa 2.3.1 all 15 267 0.0562 0.0286 0.0838
## 6 Western Africa 2.3.2 all 37 267 0.139 0.0971 0.180
## 7 Western Africa 3.1 all 1 267 0.00375 0 0.0111
## 8 Western Africa 3.1.1 all 172 267 0.644 0.587 0.702
## 9 Western Africa 3.3 all 1 267 0.00375 0 0.0111
## 10 Western Africa 4.1 all 10 267 0.0375 0.0147 0.0602
## 11 Western Africa 4.1.1 all 3 267 0.0112 0 0.0239
tgc_all %>% filter(Region=="Western Africa") %>% filter(Final_genotype=="3.1.1") %>% group_by(Year, Country_Origin) %>% summarise(n=n()) %>% arrange(Country_Origin, Year)
## `summarise()` has grouped output by 'Year'. You can override using the `.groups`
## argument.
## # A tibble: 37 × 3
## # Groups: Year [16]
## Year Country_Origin n
## <chr> <chr> <int>
## 1 2002 Benin 1
## 2 2004 Benin 2
## 3 2009 Benin 1
## 4 2006 Burkina Faso 3
## 5 2012 Burkina Faso 2
## 6 2013 Burkina Faso 6
## 7 2006 Cote d'Ivoire 1
## 8 2007 Cote d'Ivoire 2
## 9 2008 Cote d'Ivoire 1
## 10 2015 Gambia 1
## # … with 27 more rows
tgc_all %>% filter(Region=="Western Africa") %>% filter(Final_genotype=="2.3.2") %>% group_by(Year, Country_Origin) %>% summarise(n=n()) %>% arrange(Country_Origin, Year)
## `summarise()` has grouped output by 'Year'. You can override using the `.groups`
## argument.
## # A tibble: 26 × 3
## # Groups: Year [16]
## Year Country_Origin n
## <chr> <chr> <int>
## 1 2012 Burkina Faso 1
## 2 2013 Burkina Faso 1
## 3 2008 Gambia 3
## 4 2009 Gambia 1
## 5 2010 Gambia 1
## 6 2011 Gambia 3
## 7 2012 Gambia 7
## 8 2013 Gambia 5
## 9 2014 Gambia 3
## 10 2010 Ghana 2
## # … with 16 more rows
# Middle Africa
genotypes_by_region %>% filter(Region=="Middle Africa") %>% filter(Year=="all")
## # A tibble: 4 × 8
## # Groups: Region [1]
## Region Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Middle Africa 2.1 all 2 21 0.0952 0 0.221
## 2 Middle Africa 2.5.1 all 16 21 0.762 0.580 0.944
## 3 Middle Africa 4.1.1 all 2 21 0.0952 0 0.221
## 4 Middle Africa 4.3.1.2.EA3 all 1 21 0.0476 0 0.139
genotypes_by_country %>% filter(Region=="Middle Africa") %>% filter(Year=="all")
## # A tibble: 4 × 9
## # Groups: Region, Country_Origin [3]
## Region Country_Origin Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Middle A… Angola 4.1.1 all 2 2 1 1 1
## 2 Middle A… Chad 2.1 all 2 2 1 1 1
## 3 Middle A… Democratic Re… 2.5.1 all 16 17 0.941 0.829 1
## 4 Middle A… Democratic Re… 4.3.1.2… all 1 17 0.0588 0 0.171
tgc_from2010 %>% filter(Region=="Middle Africa") %>% select(Final_genotype, Year, Country_Origin, Country, `Travel Associated`, `Isolating Lab`)
## # A tibble: 21 × 6
## Final_genotype Year Country_Origin Country `Travel Associ…` `Isolating Lab`
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2.1 2019 Chad France Yes IPA
## 2 2.1 2019 Chad France Yes IPA
## 3 2.5.1 2016 Democratic Rep… USA Yes CDC
## 4 4.1.1 2015 Angola United… Yes PHE
## 5 4.1.1 2015 Angola United… Yes PHE
## 6 4.3.1.2.EA3 2018 Democratic Rep… United… Yes PHE
## 7 2.5.1 2011 Democratic Rep… Democr… No INR
## 8 2.5.1 2010 Democratic Rep… Democr… No INR
## 9 2.5.1 2010 Democratic Rep… Democr… No INR
## 10 2.5.1 2011 Democratic Rep… Democr… No INR
## # … with 11 more rows
# Northern Africa
tgc_from2010 %>% filter(Region=="Northern Africa") %>% select(Final_genotype, Year, Country_Origin, Country, `Travel Associated`, `Isolating Lab`)
## # A tibble: 6 × 6
## Final_genotype Year Country_Origin Country `Travel Associ…` `Isolating Lab`
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1.1 2017 Morocco USA Yes CDC
## 2 0.1 2015 Egypt United K… Yes PHE
## 3 0.1 2017 Morocco United K… Yes PHE
## 4 4 2018 Sudan United K… Yes PHE
## 5 4 2018 Sudan United K… Yes PHE
## 6 3.3 2019 Tunisia United K… Yes PHE
# Central America
genotypes_by_region %>% filter(Region=="Central America") %>% filter(Year=="all")
## # A tibble: 7 × 8
## # Groups: Region [1]
## Region Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Central America 2 all 1 100 0.01 0 0.0295
## 2 Central America 2.0.2 all 24 100 0.24 0.156 0.324
## 3 Central America 2.3.2 all 55 100 0.55 0.452 0.648
## 4 Central America 4.1 all 17 100 0.17 0.0964 0.244
## 5 Central America 4.3.1.1 all 1 100 0.01 0 0.0295
## 6 Central America 4.3.1.1.P1 all 1 100 0.01 0 0.0295
## 7 Central America 4.3.1.2 all 1 100 0.01 0 0.0295
genotypes_by_country %>% filter(Region=="Central America") %>% filter(Year=="all")
## # A tibble: 12 × 9
## # Groups: Region, Country_Origin [4]
## Region Country_Origin Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Central… El Salvador 2.0.2 all 2 19 0.105 0 0.243
## 2 Central… El Salvador 2.3.2 all 17 19 0.895 0.757 1
## 3 Central… Guatemala 2 all 1 22 0.0455 0 0.132
## 4 Central… Guatemala 2.0.2 all 7 22 0.318 0.124 0.513
## 5 Central… Guatemala 2.3.2 all 9 22 0.409 0.204 0.615
## 6 Central… Guatemala 4.1 all 5 22 0.227 0.0522 0.402
## 7 Central… Mexico 2.0.2 all 15 58 0.259 0.146 0.371
## 8 Central… Mexico 2.3.2 all 29 58 0.5 0.371 0.629
## 9 Central… Mexico 4.1 all 12 58 0.207 0.103 0.311
## 10 Central… Mexico 4.3.1.1… all 1 58 0.0172 0 0.0507
## 11 Central… Mexico 4.3.1.2 all 1 58 0.0172 0 0.0507
## 12 Central… Panama 4.3.1.1 all 1 1 1 1 1
tgc_all %>% filter(Region=="Central America") %>% group_by(`Isolating Lab`) %>% summarise(n=n())
## # A tibble: 5 × 2
## `Isolating Lab` n
## <chr> <int>
## 1 CDC 89
## 2 ESR 1
## 3 IPA 3
## 4 MDU 2
## 5 PHE 8
tgc_from2010 %>% filter(Region=="Central America") %>% group_by(Country_Origin) %>% summarise(n=n())
## # A tibble: 4 × 2
## Country_Origin n
## <chr> <int>
## 1 El Salvador 19
## 2 Guatemala 22
## 3 Mexico 58
## 4 Panama 1
tgc_from2010 %>% filter(Region=="Central America") %>% group_by(Country_Origin, Year) %>% summarise(n=n())
## `summarise()` has grouped output by 'Country_Origin'. You can override using the
## `.groups` argument.
## # A tibble: 15 × 3
## # Groups: Country_Origin [4]
## Country_Origin Year n
## <chr> <chr> <int>
## 1 El Salvador 2012 1
## 2 El Salvador 2016 3
## 3 El Salvador 2017 2
## 4 El Salvador 2018 2
## 5 El Salvador 2019 11
## 6 Guatemala 2016 5
## 7 Guatemala 2017 3
## 8 Guatemala 2018 3
## 9 Guatemala 2019 11
## 10 Mexico 2011 1
## 11 Mexico 2016 6
## 12 Mexico 2017 13
## 13 Mexico 2018 8
## 14 Mexico 2019 30
## 15 Panama 2016 1
tgc_from2010 %>% filter(Region=="Central America") %>% group_by(Country_Origin, Final_genotype) %>% summarise(n=n()) %>% mutate(p=n/sum(n))
## `summarise()` has grouped output by 'Country_Origin'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
## # Groups: Country_Origin [4]
## Country_Origin Final_genotype n p
## <chr> <chr> <int> <dbl>
## 1 El Salvador 2.0.2 2 0.105
## 2 El Salvador 2.3.2 17 0.895
## 3 Guatemala 2 1 0.0455
## 4 Guatemala 2.0.2 7 0.318
## 5 Guatemala 2.3.2 9 0.409
## 6 Guatemala 4.1 5 0.227
## 7 Mexico 2.0.2 15 0.259
## 8 Mexico 2.3.2 29 0.5
## 9 Mexico 4.1 12 0.207
## 10 Mexico 4.3.1.1.P1 1 0.0172
## 11 Mexico 4.3.1.2 1 0.0172
## 12 Panama 4.3.1.1 1 1
tgc_all %>% filter(Region=="Central America") %>% filter(Year <2010) %>% select(Final_genotype, `Isolating Lab`, Country_Origin, Year)
## # A tibble: 3 × 4
## Final_genotype `Isolating Lab` Country_Origin Year
## <chr> <chr> <chr> <chr>
## 1 2.3.2 IPA Mexico 1998
## 2 4.1 IPA Mexico 1998
## 3 2.3.2 IPA Mexico 1972
# Southern America
genotypes_by_region %>% filter(Region=="South America") %>% filter(Year=="all")
## # A tibble: 14 × 8
## # Groups: Region [1]
## Region Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 South America 1.1 all 19 105 0.181 0.107 0.255
## 2 South America 1.2.1 all 6 105 0.0571 0.0127 0.102
## 3 South America 2 all 19 105 0.181 0.107 0.255
## 4 South America 2.0.2 all 6 105 0.0571 0.0127 0.102
## 5 South America 2.2 all 3 105 0.0286 0 0.0604
## 6 South America 2.3.2 all 1 105 0.00952 0 0.0281
## 7 South America 2.3.3 all 5 105 0.0476 0.00689 0.0884
## 8 South America 2.5 all 4 105 0.0381 0.00148 0.0747
## 9 South America 3 all 1 105 0.00952 0 0.0281
## 10 South America 3.3 all 1 105 0.00952 0 0.0281
## 11 South America 3.5 all 28 105 0.267 0.182 0.351
## 12 South America 4.1 all 5 105 0.0476 0.00689 0.0884
## 13 South America 4.3.1.1 all 2 105 0.0190 0 0.0452
## 14 South America 4.3.1.2 all 5 105 0.0476 0.00689 0.0884
tgc_from2010 %>% filter(Region=="South America") %>% group_by(Country_Origin) %>% summarise(n=n()) %>% mutate(p=n/sum(n))
## # A tibble: 5 × 3
## Country_Origin n p
## <chr> <int> <dbl>
## 1 Brazil 1 0.00952
## 2 Chile 97 0.924
## 3 French Guiana 3 0.0286
## 4 Peru 3 0.0286
## 5 Suriname 1 0.00952
genotypes_by_region %>% filter(Region=="South America") %>% filter(Year=="all") %>% filter(p>0.05) %>% arrange(p)
## # A tibble: 5 × 8
## # Groups: Region [1]
## Region Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 South America 1.2.1 all 6 105 0.0571 0.0127 0.102
## 2 South America 2.0.2 all 6 105 0.0571 0.0127 0.102
## 3 South America 1.1 all 19 105 0.181 0.107 0.255
## 4 South America 2 all 19 105 0.181 0.107 0.255
## 5 South America 3.5 all 28 105 0.267 0.182 0.351
tgc_all %>% filter(Country_Origin=="Colombia") %>% select(Final_genotype, Year) %>% group_by(Final_genotype, Year) %>% summarise(n=n()) %>% mutate(p=n/sum(n))
## `summarise()` has grouped output by 'Final_genotype'. You can override using the
## `.groups` argument.
## # A tibble: 30 × 4
## # Groups: Final_genotype [4]
## Final_genotype Year n p
## <chr> <chr> <int> <dbl>
## 1 1.1 2004 1 1
## 2 2 1997 1 0.2
## 3 2 2000 1 0.2
## 4 2 2003 1 0.2
## 5 2 2004 1 0.2
## 6 2 2018 1 0.2
## 7 2.5 1999 4 0.0784
## 8 2.5 2000 4 0.0784
## 9 2.5 2005 4 0.0784
## 10 2.5 2006 1 0.0196
## # … with 20 more rows
tgc_all %>% filter(Country_Origin=="French Guiana") %>% select(Final_genotype, Year, `Isolating Lab`)
## # A tibble: 5 × 3
## Final_genotype Year `Isolating Lab`
## <chr> <chr> <chr>
## 1 2.5 2002 IPA
## 2 2.5 2002 IPA
## 3 2.5 2016 IPA
## 4 2.5 2018 IPA
## 5 2.5 2019 IPA
tgc_all %>% filter(Final_genotype=="2.3.2") %>% group_by(Region, Country_Origin) %>% summarise(n=n())
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## # A tibble: 32 × 3
## # Groups: Region [15]
## Region Country_Origin n
## <chr> <chr> <int>
## 1 Caribbean Dominican Republic 1
## 2 Caribbean Haiti 4
## 3 Caribbean Turks and Caicos Islands 1
## 4 Central America El Salvador 17
## 5 Central America Guatemala 9
## 6 Central America Mexico 31
## 7 Middle Africa Angola 1
## 8 Northern America USA 21
## 9 Northern Europe United Kingdom 2
## 10 Polynesia Samoa 1
## # … with 22 more rows
# Pacific Islands
genotypes_by_country %>% filter(Country_Origin=="Papua New Guinea") %>% filter(Year=="all")
## # A tibble: 1 × 9
## # Groups: Region, Country_Origin [1]
## Region Country_Origin Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Melanesia Papua New Guin… 2.1.7.1 all 5 5 1 1 1
genotypes_by_country %>% filter(Country_Origin=="Samoa") %>% filter(Year=="all")
## # A tibble: 4 × 9
## # Groups: Region, Country_Origin [1]
## Region Country_Origin Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Polynes… Samoa 2.2.1 all 1 259 0.00386 0 0.0114
## 2 Polynes… Samoa 3.5.3 all 3 259 0.0116 0 0.0246
## 3 Polynes… Samoa 3.5.4 all 246 259 0.950 0.923 0.976
## 4 Polynes… Samoa 4.1 all 9 259 0.0347 0.0124 0.0571
genotypes_by_country %>% filter(Country_Origin=="Fiji") %>% filter(Year=="all")
## # A tibble: 3 × 9
## # Groups: Region, Country_Origin [1]
## Region Country_Origin Genotype Year x N p p_lowerCI p_upperCI
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Melanesia Fiji 3.3.1 all 1 32 0.0312 0 0.0915
## 2 Melanesia Fiji 4.2.1 all 1 32 0.0312 0 0.0915
## 3 Melanesia Fiji 4.2.2 all 30 32 0.938 0.854 1
# genotype freqs per region, from 2010
region_N <- tgc_from2010 %>%
group_by(Region)%>%
summarize(region_total =n())
# genotype freqs per country, from 2010
country_N <- tgc_from2010 %>%
group_by(Country_Origin)%>%
summarize(country_total =n())
country_geno <- tgc_from2010 %>%
group_by(Country_Origin, Final_genotype)%>%
summarize(n =n()) %>%
left_join(country_N) %>% # country counts
mutate(geno_freq_country = round(n/country_total*100,0)) # percent per region
## `summarise()` has grouped output by 'Country_Origin'. You can override using the `.groups` argument.
## Joining, by = "Country_Origin"
# genotypes with ≥20% freq in any country; note this includes all H58 subtypes already
geno_country20 <- names(table(country_geno$Final_genotype[country_geno$geno_freq_country>=20]))
# set genotype colours
geno_info <- read_csv("https://raw.githubusercontent.com/katholt/genotyphi/main/Genotype_specification.csv") %>% filter(Genotype %in% geno_country20)
## Rows: 82 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Genotype, H58 Genotype, Gene in CT18 reference sequence, Product, ...
## dbl (1): Position in CT18 reference sequence
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
geno_colours <- c(geno_info$`Colour code`,"grey")
names(geno_colours) <- c(geno_info$Genotype, "Other")
geno_colours <- geno_colours[order(names(geno_colours))] # sort for legend
# distinguish H58 subtypes
geno_colours2 <- geno_colours
geno_colours2["4.3.1.1.EA1"] = "#c18cda"
geno_colours2["4.3.1.2.EA2"] = "#ee37c9"
geno_colours2["4.3.1.2.EA3"] = "#94165f"
# read gps coordinates
gps <- read.csv("../UN_region_coords.csv")%>%
select(Region, latitude, longitude)
# pie data, 2010 - 2020, genotype >20% prevalent in an individual country since 2010
tgc_from2010_toMap_countryGeno <- tgc_from2010 %>%
mutate(map_genotype = if_else(Final_genotype %in% geno_country20, Final_genotype, "Other"))%>%
select(Region, Year, Final_genotype, map_genotype)%>%
dcast(Region ~ map_genotype)%>%
left_join(gps)%>%
left_join(region_N)%>%
mutate(lat = as.numeric(latitude),
long = as.numeric(longitude),
mytext=paste(
Region, "\n",
"N= ", region_total, sep="")
)
## Using map_genotype as value column: use value.var to override.
## Aggregation function missing: defaulting to length
## Joining, by = "Region"
## Joining, by = "Region"
typhi_countryN <- tgc_meta %>%
filter(Year >= 2010) %>% # restrict to last 10 years
group_by(Country_Origin)%>%
summarize(CountryCount2010=n())
# match map object country names
typhi_countryN$Country_Origin <- replace(typhi_countryN$Country_Origin, typhi_countryN$Country_Origin=="United Kingdom", "UK")
typhi_countryN$Country_Origin <- replace(typhi_countryN$Country_Origin, typhi_countryN$Country_Origin=="Viet Nam", "Vietnam")
# add N per country
typhi_Nmap <- map_data('world') %>%
group_by(region) %>%
# Merge in Typhi counts
left_join(typhi_countryN, by = c('region' = 'Country_Origin'))
typhi_Nmap$CountryData <- typhi_Nmap$CountryCount2010 > 0
genotype_pie_map_countryGeno_nolegend <- ggplot(data=typhi_Nmap, aes( x = long, y = lat, group=group)) +
geom_polygon(data=typhi_Nmap, aes(fill = CountryData), color="grey") +
scale_fill_manual(values=c("#f2e6d9"), na.value="white") + # pale brown
new_scale_fill() +
geom_scatterpie(data = tgc_from2010_toMap_countryGeno, aes(x= long, y=lat , group = Region, r = 9),
cols= c(geno_country20, "Other"), color=NA, alpha=1) +
scale_fill_manual(values = geno_colours2)+
labs(fill = "Genotype")+
ylab("")+
xlab("")+
theme_minimal()+
theme(axis.text = element_blank(),
axis.line = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
panel.grid = element_blank(),
legend.title = element_text(face = "bold"),
legend.position="none") +
ggtitle("a) Genotype prevalence by world region, 2010-2020")
genotype_pie_map_countryGeno_nolegend
## Warning: Removed 240 rows containing non-finite values (stat_pie).
#select for 2010 - 2020, H58 genotypes plus any genotype >20% prevalent in an individual country
country_year_data_39geno <- tgc_meta %>%
mutate(map_genotype = if_else(Final_genotype %in% geno_country20, Final_genotype, "Other"))%>%
filter(Year >=2010)
region <- table(country_year_data_39geno$Region)
region <- names(region[region>=20])
region_facet_39geno <- country_year_data_39geno %>%
filter(Year <=2020) %>%
filter(Region %in% region) %>%
ggplot(aes(fill=map_genotype, x=Year)) +
geom_bar(position="fill") +
scale_fill_manual(values = geno_colours2)+
labs(fill = "Genotype")+
ylab("")+
xlab("")+
facet_wrap(~Region , ncol=3) +
theme_bw() +
theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=8, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 6, colour = "black"), legend.text = element_text(size=8), legend.key.size=unit(0.25, "cm"), legend.title = element_text(size=8, face="bold")) +
guides(fill=guide_legend(ncol=1))
region_facet_39geno
pdf(file=paste0("SuppFig2_region_facet.pdf"),width=7,height=8)
region_facet_39geno
dev.off()
## quartz_off_screen
## 2
png(paste0("SuppFig2_region_facet.png"), width=7, height=8, units="in", res=400)
region_facet_39geno
dev.off()
## quartz_off_screen
## 2
countryN <- table(tgc_from2010$Country_Origin)
country50 <- names(countryN[countryN>=50])
country_facet_39geno_exclKey <- country_year_data_39geno %>%
filter(!(Country_Origin %in% country50)) %>%
ggplot(aes(fill=map_genotype, x=Year)) +
geom_bar(position="fill") +
scale_fill_manual(values = geno_colours2)+
labs(fill = "Genotype")+
ylab("")+
xlab("")+
facet_wrap(~Region + Country_Origin, ncol=6) +
theme_bw() +
theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=8, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 6, colour = "black"), legend.text = element_text(size=10), legend.key.size=unit(0.25, "cm"), legend.title = element_text(size=10, face="bold")) +
guides(fill=guide_legend(ncol=1))
country_facet_39geno_exclKey
pdf(file=paste0("SuppFig3_country_facet_exclKey.pdf"),width=13,height=18)
country_facet_39geno_exclKey
dev.off()
## quartz_off_screen
## 2
png(file=paste0("SuppFig3_country_facet_exclKey.png"),width=13,height=18, units="in", res=400)
country_facet_39geno_exclKey
dev.off()
## quartz_off_screen
## 2
#select for 2010 - 2020, H58 genotypes plus any genotype >20% prevalent in an individual country
country_facet_39geno_keycountries <- tgc_meta %>%
mutate(map_genotype = if_else(Final_genotype %in% geno_country20, Final_genotype, "Other"))%>%
select(Year, map_genotype, Region, Country_Origin)%>%
filter(Year >=2010 & Year <=2020) %>%
filter(Country_Origin %in% country50) %>%
filter(!(Country_Origin %in% c("United Kingdom","USA"))) %>%
ggplot(aes(fill=map_genotype, x=Year)) +
geom_bar(position="fill") +
scale_fill_manual(values = geno_colours2)+
labs(fill = "Genotype")+
ylab("")+
xlab("")+
facet_wrap(~Region + Country_Origin, ncol=3) +
theme_bw() +
theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=8, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 6, colour = "black"), legend.text = element_text(size=8), legend.key.size=unit(0.25, "cm"), legend.title = element_text(size=8, face="bold")) +
guides(fill=guide_legend(ncol=1))+
ggtitle("b) Annual genotype prevalence, for countries with at least 50 genomes")
country_facet_39geno_keycountries
pdf(file=paste0("Fig1_map_genotypesMainCountries_facet.pdf"),width=7,height=9)
genotype_pie_map_countryGeno_nolegend / (country_facet_39geno_keycountries + guide_area() + plot_layout(widths=c(5,1))) + plot_layout(nrow=2, heights=c(2,3))
## Warning: Removed 240 rows containing non-finite values (stat_pie).
dev.off()
## quartz_off_screen
## 2
png(file=paste0("Fig1_map_genotypesMainCountries_facet.png"),width=7,height=9, units="in", res=400)
genotype_pie_map_countryGeno_nolegend / (country_facet_39geno_keycountries + guide_area() + plot_layout(widths=c(5,1))) + plot_layout(nrow=2, heights=c(2,3))
## Warning: Removed 240 rows containing non-finite values (stat_pie).
dev.off()
## quartz_off_screen
## 2
mdr_genotypes_key_countries <- tgc_meta %>%
select(Year, Final_genotype, Region, Country_Origin, MDR)%>%
filter(Year >=2010 & Year <=2020) %>%
filter(Country_Origin %in% country50) %>%
filter(!(Country_Origin %in% c("United Kingdom","USA"))) %>%
mutate(MDR_geno = ifelse(MDR, Final_genotype, "Non-MDR"))
geno_colours_mdr <- geno_colours[names(table(mdr_genotypes_key_countries$MDR_geno))]
geno_colours_mdr["4.3.1.3"] <- geno_colours["4.3.1.3.Bdq"]
geno_colours_mdr["Non-MDR"] <- "lightgrey"
geno_colours_mdr <- na.omit(geno_colours_mdr)
mdr_genotypes_key_countries$MDR_geno <- factor(mdr_genotypes_key_countries$MDR_geno, levels=names(table(mdr_genotypes_key_countries$MDR_geno))[c(12,1:11)], ordered=T)
country_facet_39geno_keycountries_MDR <- mdr_genotypes_key_countries %>%
ggplot(aes(fill=MDR_geno, x=Year)) +
geom_bar(position="fill") +
scale_fill_manual(values = geno_colours_mdr)+
labs(fill = "Genotype")+
ylab("")+
xlab("")+
facet_wrap(~Region + Country_Origin, ncol=3) +
theme_bw() +
theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=10, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 8, colour = "black"), legend.text = element_text(size=8), legend.key.size=unit(0.25, "cm"), legend.title = element_text(size=8, face="bold")) +
guides(fill=guide_legend(ncol=1))+
ggtitle("a) Annual MDR genotype prevalence, for countries with at least 50 genomes")
country_facet_39geno_keycountries_MDR
cipNS_genotypes_key_countries <- tgc_meta %>%
select(Year, Final_genotype, Region, Country_Origin, cipNS)%>%
filter(Year >=2010 & Year <=2020) %>%
filter(Country_Origin %in% country50) %>%
filter(!(Country_Origin %in% c("United Kingdom","USA"))) %>%
mutate(cipNS_geno = ifelse(cipNS, Final_genotype, "CipS"))
geno_cip <- names(table(cipNS_genotypes_key_countries$Final_genotype)[table(cipNS_genotypes_key_countries$Final_genotype)>=10])
geno_colours_cipI <- read_csv("https://raw.githubusercontent.com/katholt/genotyphi/main/Genotype_specification.csv") %>% filter(Genotype %in% geno_cip)
## Rows: 82 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Genotype, H58 Genotype, Gene in CT18 reference sequence, Product, ...
## dbl (1): Position in CT18 reference sequence
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
geno_colours_cipNS <- c(geno_colours_cipI$`Colour code`, "grey")
names(geno_colours_cipNS) <- c(geno_colours_cipI$Genotype, "Other")
geno_colours_cipNS <- geno_colours_cipNS[order(names(geno_colours_cipNS))]
geno_colours_cipNS["CipS"] <- "lightgrey"
cipNS_genotypes_key_countries$cipNS_geno <- factor(cipNS_genotypes_key_countries$cipNS_geno, levels=c("CipS",geno_cip), ordered=T)
country_facet_39geno_keycountries_cipNS <- cipNS_genotypes_key_countries %>%
mutate(group_genotype = if_else(Final_genotype %in% geno_colours_cipI, Final_genotype, "Other"))%>%
ggplot(aes(fill=cipNS_geno, x=Year)) +
geom_bar(position="fill") +
scale_fill_manual(values = geno_colours_cipNS)+
labs(fill = "Genotype")+
ylab("")+
xlab("")+
facet_wrap(~Region + Country_Origin, ncol=3) +
theme_bw() +
theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=12, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 8, colour = "black"), legend.text = element_text(size=8), legend.key.size=unit(0.25, "cm"), legend.title = element_text(size=8, face="bold")) +
guides(fill=guide_legend(ncol=1))+
ggtitle("b) Annual CipNS genotype prevalence, for countries with at least 50 genomes")
country_facet_39geno_keycountries_cipNS
pdf(file=paste0("SuppFig7_genotypesMainCountries_MDR_CipNS.pdf"),width=9,height=12)
country_facet_39geno_keycountries_MDR / country_facet_39geno_keycountries_cipNS
dev.off()
## quartz_off_screen
## 2
png(file=paste0("SuppFig7_genotypesMainCountries_MDR_CipNS.png"),width=9,height=12, units="in", res=400)
country_facet_39geno_keycountries_MDR / country_facet_39geno_keycountries_cipNS
dev.off()
## quartz_off_screen
## 2
#select for 2010 - 2020, H58 genotypes plus any genotype >20% prevalent in an individual country
country_lab_year_data <- tgc_from2010 %>%
mutate(map_genotype = if_else(Final_genotype %in% geno_country20, Final_genotype, "Other"))%>%
mutate(Country_lab_label = paste(Region, ":", Country_Origin, ":", `Isolating Lab`)) %>%
select(Country_lab_label, Year, map_genotype, Final_genotype, Region, Country_Origin, `Isolating Lab`)%>%
filter(Year >=2010)
country_lab_year_data_SA <- country_lab_year_data %>%
filter(Country_Origin %in% c("Bangladesh", "India", "Nepal", "Pakistan"))
country_lab_year_data_SA <- country_lab_year_data_SA %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CDC", "*US (CDC)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PHE", "*England (PHE)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="MDU", "*Australia (MDU)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="ESR", "*New Zealand (ESR)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CHR", "Dhaka (CHR)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="ICD", "Dhaka (ICD)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CMH", "Chittagong (ICD)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="AII", "New Delhi (AII)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CML", "Ludhiana (CML)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CNH", "New Delhi (CNH)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KHM", "Mumbai (KHM)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KKC", "Chennai (KKC)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="RDT", "Bathalapalli (RDT)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="SAP", "New Delhi (SAP)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CMC", "Vellore (CMC)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KIMS", "Mumbai (KIM)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PGI", "Chandigarh (PGI)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="SJB", "Bengaluru (SJB)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KUH", "Dhulikhel (KUH)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PAH", "Lalitpur (PAH)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="AKU", "Karachi (AKU)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="JHL", "Lahore (JHL)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="SEFI", "Multi-site (SEFI)"))
lab_N_SA <- country_lab_year_data_SA %>%
group_by(Country_Origin, `Isolating Lab`)%>%
summarize(lab_total =n())
## `summarise()` has grouped output by 'Country_Origin'. You can override using the
## `.groups` argument.
lab_N_SA
## # A tibble: 46 × 3
## # Groups: Country_Origin [4]
## Country_Origin `Isolating Lab` lab_total
## <chr> <chr> <int>
## 1 Bangladesh *Australia (MDU) 20
## 2 Bangladesh *England (PHE) 104
## 3 Bangladesh *US (CDC) 88
## 4 Bangladesh Chittagong (ICD) 26
## 5 Bangladesh Dhaka (CHR) 821
## 6 Bangladesh Dhaka (ICD) 528
## 7 Bangladesh IDS 4
## 8 India *Australia (MDU) 182
## 9 India *England (PHE) 441
## 10 India *New Zealand (ESR) 47
## # … with 36 more rows
country_lab_data_SA_genotypeFreq <- country_lab_year_data_SA %>%
group_by(Country_Origin, `Isolating Lab`, Final_genotype) %>%
summarise (n=n())%>%
left_join(lab_N_SA) %>% # annual lab counts
mutate(lab_freq = n/lab_total,
lab_freq_sd = sqrt((lab_freq*(1-lab_freq))/lab_total),
min=lab_freq-1.96*lab_freq_sd,
max=lab_freq+1.96*lab_freq_sd)
## `summarise()` has grouped output by 'Country_Origin', 'Isolating Lab'. You can override using the `.groups` argument.
## Joining, by = c("Country_Origin", "Isolating Lab")
# 4.3.1.2 in India
country_lab_data_SA_genotypeFreq %>% filter(Country_Origin=="India") %>% filter(Final_genotype=="4.3.1.2")
## # A tibble: 22 × 9
## # Groups: Country_Origin, Isolating Lab [22]
## Country_Origin `Isolating Lab` Final_genotype n lab_total lab_freq
## <chr> <chr> <chr> <int> <int> <dbl>
## 1 India *Australia (MDU) 4.3.1.2 76 182 0.418
## 2 India *England (PHE) 4.3.1.2 223 441 0.506
## 3 India *New Zealand (ESR) 4.3.1.2 17 47 0.362
## 4 India *US (CDC) 4.3.1.2 180 359 0.501
## 5 India Bathalapalli (RDT) 4.3.1.2 5 31 0.161
## 6 India Bengaluru (SJB) 4.3.1.2 87 124 0.702
## 7 India CCH 4.3.1.2 1 4 0.25
## 8 India Chandigarh (PGI) 4.3.1.2 93 237 0.392
## 9 India Chennai (KKC) 4.3.1.2 54 81 0.667
## 10 India DCH 4.3.1.2 1 4 0.25
## # … with 12 more rows, and 3 more variables: lab_freq_sd <dbl>, min <dbl>,
## # max <dbl>
# add pooled estimate
country_lab_data_SA_genotypeFreq <- tgc_from2010_countryFreq_allGeno %>%
filter(Country_Origin %in% c("India", "Bangladesh","Nepal","Pakistan")) %>%
rename(lab_freq=p, min=p_lowerCI, max=p_upperCI, lab_total=N, n=x) %>%
mutate(`Isolating Lab`="(Pooled)") %>%
bind_rows(country_lab_data_SA_genotypeFreq) %>%
ungroup() %>%
select(-c(Region, lab_freq_sd))
# choose most common genotypes to plot
SA_geno_freq <- rev(sort(table(country_lab_year_data_SA$Final_genotype)))
country_lab_data_SA_genotypeFreq_estimates <-
country_lab_data_SA_genotypeFreq %>%
mutate(pooled=if_else(`Isolating Lab`=="(Pooled)",1,0.5)) %>%
filter(Final_genotype %in% names(SA_geno_freq[1:9])) %>%
filter(lab_total >=20) %>%
mutate(country_lab=paste(Country_Origin, ":", `Isolating Lab`)) %>%
ggplot() +
geom_linerange( mapping=aes(y=country_lab, xmin=min, xmax=max, colour=Country_Origin)) +
geom_point(mapping=aes(y=country_lab, x=lab_freq, colour=Country_Origin, alpha=pooled), size=1) +
facet_wrap(. ~ Final_genotype, ncol=3) + theme_bw() +
scale_color_manual(values=c("red","blue","orange","black")) +
guides(alpha = "none") +
labs(color="Country:", x="Genotype prevalence estimate (%)", y="Source laboratory") +
theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=10, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 8, colour = "black"), axis.text.y = element_text(size = 7, colour = "black"), legend.title = element_text(size=10, face="bold"), legend.position="bottom")
country_lab_data_SA_genotypeFreq_estimates
country_lab_data_SA_genotypeFreq
## # A tibble: 436 × 8
## Country_Origin Final_genotype n lab_total lab_freq min max
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Bangladesh 2 17 1591 0.0107 0.00563 0.0157
## 2 Bangladesh 2.0.1 103 1591 0.0647 0.0526 0.0768
## 3 Bangladesh 2.1.7 34 1591 0.0214 0.0143 0.0285
## 4 Bangladesh 2.2.1 1 1591 0.000629 0 0.00186
## 5 Bangladesh 2.3.3 274 1591 0.172 0.154 0.191
## 6 Bangladesh 2.4 1 1591 0.000629 0 0.00186
## 7 Bangladesh 3.0.1 6 1591 0.00377 0.000759 0.00678
## 8 Bangladesh 3.1.2 12 1591 0.00754 0.00329 0.0118
## 9 Bangladesh 3.2 1 1591 0.000629 0 0.00186
## 10 Bangladesh 3.2.2 264 1591 0.166 0.148 0.184
## # … with 426 more rows, and 1 more variable: `Isolating Lab` <chr>
pdf(file=paste0("SuppFig12_country_lab_data_SA_genotypeFreq_estimates.pdf"),width=8,height=12)
country_lab_data_SA_genotypeFreq_estimates
dev.off()
## quartz_off_screen
## 2
png(file=paste0("SuppFig12_country_lab_data_SA_genotypeFreq_estimates.png"),width=8,height=12, units="in", res=400)
country_lab_data_SA_genotypeFreq_estimates
dev.off()
## quartz_off_screen
## 2
# lab-specific
lab_level <- country_lab_data_SA_genotypeFreq %>%
filter(`Isolating Lab` != "(Pooled)")
pooled <- country_lab_data_SA_genotypeFreq %>%
filter(`Isolating Lab` == "(Pooled)") %>%
rename(pooled_freq=lab_freq, pooled_min=min, pooled_max=max)
labs_pooled <- left_join(lab_level, pooled, by=c("Country_Origin", "Final_genotype"))
labs_pooled <- labs_pooled %>%
filter(Final_genotype %in% names(SA_geno_freq[1:9])) %>%
filter(lab_total.x >=20)
nrow(labs_pooled)
## [1] 140
labs_pooled_overlap_estimate <- labs_pooled %>%
filter((lab_freq > pooled_min) & (lab_freq < pooled_max))
nrow(labs_pooled_overlap_estimate)/nrow(labs_pooled)
## [1] 0.2214286
labs_pooled_overlap_interval <- labs_pooled %>%
filter(!( (max<pooled_min) | (min>pooled_max)))
nrow(labs_pooled_overlap_interval)/nrow(labs_pooled)
## [1] 0.6714286
country_lab_year_data_SA <- country_lab_year_data %>%
filter(Country_Origin %in% c("Bangladesh", "India", "Nepal", "Pakistan"))
# set genotype colours
geno_info_SA <- read_csv("https://raw.githubusercontent.com/katholt/genotyphi/main/Genotype_specification.csv") %>% filter(Genotype %in% country_lab_year_data_SA$Final_genotype)
## Rows: 82 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Genotype, H58 Genotype, Gene in CT18 reference sequence, Product, ...
## dbl (1): Position in CT18 reference sequence
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
geno_colours_SA <- c(geno_info_SA$`Colour code`)
names(geno_colours_SA) <- c(geno_info_SA$Genotype)
# distinguish H58 subtypes
geno_colours_SA["4.3.1.2.EA2"] = "#ee37c9"
geno_colours_SA["4.3.1.2.EA3"] = "#94165f"
geno_colours_SA <- geno_colours_SA[order(names(geno_colours_SA))] # sort for legend
lab_counts <- table(country_lab_year_data$Country_lab_label)
# South Asian countries with multiple source labs, with N≥20 per lab
country_facet_lab_SAsia <- country_lab_year_data_SA %>%
filter(Country_lab_label %in% names(lab_counts[lab_counts>=20])) %>%
ggplot(aes(fill=Final_genotype, x=Year)) +
geom_bar(position="fill") +
scale_fill_manual(values = geno_colours_SA)+
labs(fill = "Genotype", x="", y="")+
facet_wrap(~Country_Origin + `Isolating Lab`, ncol=7) +
theme_bw() +
theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=9, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 6, colour = "black"), legend.title=element_text(size=8, face="bold"), legend.text = element_text(size=8), legend.key.size=unit(.25, "cm"), legend.position="bottom") +
guides(fill=guide_legend(nrow=4))
country_facet_lab_SAsia
country_lab_year_data_Nigeria <- country_lab_year_data %>%
filter(Country_Origin == "Nigeria")
# set genotype colours
geno_info_nigeria <- read_csv("https://raw.githubusercontent.com/katholt/genotyphi/main/Genotype_specification.csv") %>% filter(Genotype %in% country_lab_year_data_Nigeria$Final_genotype)
## Rows: 82 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Genotype, H58 Genotype, Gene in CT18 reference sequence, Product, ...
## dbl (1): Position in CT18 reference sequence
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
geno_colours_nigeria <- c(geno_info_nigeria$`Colour code`,"grey")
names(geno_colours_nigeria) <- c(geno_info_nigeria$Genotype, "Other")
geno_colours_nigeria <- geno_colours_nigeria[order(names(geno_colours_nigeria))] # sort for legend
country_facet_lab_Nigeria <- country_lab_year_data_Nigeria %>%
filter(Country_lab_label %in% names(lab_counts[lab_counts>=10])) %>%
filter(!is.na(`Isolating Lab`)) %>%
ggplot(aes(fill=Final_genotype, x=Year)) +
geom_bar(position="fill") +
scale_fill_manual(values = geno_colours_nigeria)+
labs(fill = "Genotype", x="", y="", title="c) Annual genotype prevalence, by lab")+
facet_wrap(~`Isolating Lab`, ncol=2) +
theme_bw() +
theme(strip.background = element_rect(fill ="#f0f0f5"),
strip.text=element_text(size=10, face="bold"),
axis.text.x = element_text(angle = 45, hjust = 1, size = 6, colour = "black"),
axis.text.y = element_text(size = 7, colour = "black"),
legend.title=element_text(size=8, face="bold"),
legend.text = element_text(size=8), legend.key.size=unit(.25, "cm"),
legend.position="right",
plot.title=element_text(size = 9, colour = "black", face="bold"))
country_facet_lab_Nigeria
lab_N_Nigeria <- country_lab_year_data_Nigeria %>%
group_by(`Isolating Lab`)%>%
summarize(lab_total =n())
lab_N_Nigeria
## # A tibble: 5 × 2
## `Isolating Lab` lab_total
## <chr> <int>
## 1 AKT 6
## 2 CDC 10
## 3 IBA 14
## 4 PHE 15
## 5 ZMC 105
country_lab_data_Nigeria_genotypeFreq <- country_lab_year_data_Nigeria %>%
group_by(`Isolating Lab`, Final_genotype) %>%
summarise (n=n())%>%
left_join(lab_N_Nigeria) %>% # annual lab counts
mutate(lab_freq = n/lab_total,
lab_freq_sd = sqrt((lab_freq*(1-lab_freq))/lab_total),
min=lab_freq-1.96*lab_freq_sd,
max=lab_freq+1.96*lab_freq_sd)
## `summarise()` has grouped output by 'Isolating Lab'. You can override using the `.groups` argument.
## Joining, by = "Isolating Lab"
country_lab_data_Nigeria_genotypeFreq %>% filter(Final_genotype=="3.1.1")
## # A tibble: 5 × 8
## # Groups: Isolating Lab [5]
## `Isolating Lab` Final_genotype n lab_total lab_freq lab_freq_sd min
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 AKT 3.1.1 5 6 0.833 0.152 0.535
## 2 CDC 3.1.1 7 10 0.7 0.145 0.416
## 3 IBA 3.1.1 14 14 1 0 1
## 4 PHE 3.1.1 8 15 0.533 0.129 0.281
## 5 ZMC 3.1.1 67 105 0.638 0.0469 0.546
## # … with 1 more variable: max <dbl>
# add pooled estimate
country_lab_data_Nigeria_genotypeFreq <- tgc_from2010_countryFreq_allGeno %>%
filter(Country_Origin %in% c("Nigeria")) %>%
rename(lab_freq=p, min=p_lowerCI, max=p_upperCI, lab_total=N, n=x) %>%
mutate(`Isolating Lab`="(Pooled)") %>%
bind_rows(country_lab_data_Nigeria_genotypeFreq) %>%
ungroup() %>%
select(-c(Region, lab_freq_sd, Country_Origin))
# choose most common genotypes to plot
nigeria_geno_freq <- rev(sort(table(country_lab_year_data_Nigeria$Final_genotype)))
country_lab_data_Nigeria_genotypeFreq_estimates <-
country_lab_data_Nigeria_genotypeFreq %>%
mutate(pooled=if_else(`Isolating Lab`=="(Pooled)","Pooled","Individual")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CDC", "*US (CDC)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PHE", "*England (PHE)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="IBA", "Ibadan (IBD)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="ZMC", "Abuja (ZMC)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="(Pooled)", "Nigeria (Pooled)")) %>%
filter(Final_genotype %in% names(nigeria_geno_freq[1:5])) %>%
filter(lab_total >=10) %>%
ggplot() +
geom_linerange( mapping=aes(y=`Isolating Lab`, xmin=min, xmax=max, colour=pooled)) +
geom_point(mapping=aes(y=`Isolating Lab`, x=lab_freq, colour=pooled), size=1) +
facet_wrap(. ~ Final_genotype, ncol=3) + theme_bw() +
scale_color_manual(values=c("red","black")) +
xlab("") +
ylab("") +
labs(color="Estimate type")+
ggtitle("a) Genotype prevalence, by lab") +
theme(strip.background = element_rect(fill ="#f0f0f5"),
strip.text=element_text(size=10, face="bold"),
axis.text.x = element_text(angle = 45, hjust = 1, size = 6, colour = "black"),
axis.text.y = element_text(size = 7, colour = "black"),
legend.title=element_text(size=8, face="bold"),
legend.text = element_text(size=8), legend.key.size=unit(.25, "cm"),
legend.position="right",
plot.title=element_text(size = 9, colour = "black", face="bold"))
country_lab_data_Nigeria_genotypeFreq_estimates
annual_res_country_lab <- tgc_meta %>%
mutate(cefR = as.numeric(CEP),
XDR = if_else(XDR == T, 1, 0),
MDR = if_else(MDR ==T, 1, 0),
cipNS = if_else(cipNS == T, 1, 0),
cipR = if_else(cipR == T, 1, 0),
aziR = if_else(aziR == T, 1, 0)) %>% #convert PW AMR calls to numeric values
filter(Year >= 2010) %>% # restrict to 2010 onwards
group_by(Country_Origin, Year, `Isolating Lab`) %>%
summarize(n=n(),
sumMDR= sum(MDR),
MDR= round(sumMDR/n*100, digits =0),
sumXDR= sum(XDR),
XDR= round(sumXDR/n*100, digits = 0),
sumcipNS = sum(cipNS),
cipNS = round(sumcipNS/n*100, digits = 0),
sumcipR = sum(cipR),
cipR = round(sumcipR/n*100, digits = 0),
sumcefR = sum(cefR),
cefR = round(sumcefR/n*100, digits = 0),
sumaziR = sum(aziR),
aziR = round(sumaziR/n*100, digits =0)
) %>%
unique()
## `summarise()` has grouped output by 'Country_Origin', 'Year'. You can override
## using the `.groups` argument.
# clean up AMR data (for surveillance-ready samples)
res_data <- tgc_meta %>%
mutate(cefR = as.numeric(CEP),
XDR = if_else(XDR == T, 1, 0),
MDR = if_else(MDR ==T, 1, 0),
cipNS = if_else(cipNS == T, 1, 0),
cipR = if_else(cipR == T, 1, 0),
aziR = if_else(aziR == T, 1, 0)) %>% #convert PW AMR calls to numeric values
select(Country_Origin, Region, Year, cefR, cipNS, cipR, MDR, XDR, aziR, IncHI1)
# map object has "UK", "Vietnam", changes ours to match this string format
res_data$Country_Origin <- replace(res_data$Country_Origin, res_data$Country_Origin=="United Kingdom", "UK")
res_data$Country_Origin <- replace(res_data$Country_Origin, res_data$Country_Origin=="Viet Nam", "Vietnam")
# prevalence per country, for 2010 onwards
res_freq_table <- res_data %>%
filter(Year >= 2010) %>% # restrict to last 10 years
group_by(Country_Origin)%>%
summarize(n=n(),
sumMDR= sum(MDR),
MDR= round(sumMDR/n*100, digits =0),
sumXDR= sum(XDR),
XDR= round(sumXDR/n*100, digits = 0),
sumcipNS = sum(cipNS),
cipNS = round(sumcipNS/n*100, digits = 0),
sumcipR = sum(cipR),
cipR = round(sumcipR/n*100, digits = 0),
sumcefR = sum(cefR),
cefR = round(sumcefR/n*100, digits = 0),
sumaziR = sum(aziR),
aziR = round(sumaziR/n*100, digits =0),
MDR_sd = sqrt((sumMDR/n*(1-sumMDR/n))/n),
XDR_sd = sqrt((sumXDR/n*(1-sumXDR/n))/n),
cipNS_sd = sqrt((sumcipNS/n*(1-sumcipNS/n))/n),
cipR_sd = sqrt((sumcipR/n*(1-sumcipR/n))/n),
cefR_sd = sqrt((sumcefR/n*(1-sumcefR/n))/n),
aziR_sd = sqrt((sumaziR/n*(1-sumaziR/n))/n)
) %>%
unique()
# annual by country
res_freq_annual_table <- res_data %>%
filter(Year >= 2000) %>% # restrict to last 20 years
mutate(MDR_IncHI1 = ifelse(IncHI1, MDR, 0)) %>%
group_by(Country_Origin, Year) %>%
summarize(n=n(),
sumMDR= sum(MDR),
MDR= round(sumMDR/n*100, digits =0),
sumXDR= sum(XDR),
XDR= round(sumXDR/n*100, digits = 0),
sumcipNS = sum(cipNS),
cipNS = round(sumcipNS/n*100, digits = 0),
sumcipR = sum(cipR),
cipR = round(sumcipR/n*100, digits = 0),
sumcefR = sum(cefR),
cefR = round(sumcefR/n*100, digits = 0),
sumaziR = sum(aziR),
aziR = round(sumaziR/n*100, digits =0),
MDR_sd = sqrt((sumMDR/n*(1-sumMDR/n))/n),
XDR_sd = sqrt((sumXDR/n*(1-sumXDR/n))/n),
cipNS_sd = sqrt((sumcipNS/n*(1-sumcipNS/n))/n),
cipR_sd = sqrt((sumcipR/n*(1-sumcipR/n))/n),
cefR_sd = sqrt((sumcefR/n*(1-sumcefR/n))/n),
aziR_sd = sqrt((sumaziR/n*(1-sumaziR/n))/n),
sumMDR_IncHI1 = sum(MDR_IncHI1),
`IncHI1 amongst MDR` = round(sumMDR_IncHI1/sumMDR*100, digits=0)) %>%
unique()
## `summarise()` has grouped output by 'Country_Origin'. You can override using the
## `.groups` argument.
southAsia_amr_lab <- annual_res_country_lab %>%
filter(Country_Origin %in% c("Bangladesh","India","Nepal","Pakistan")) %>%
filter(n>=10) %>% # include only estimates with N≥10
as.data.frame() %>%
melt(id.vars=c("Country_Origin","Year","Isolating Lab"), variable.name="AMR") %>%
filter(AMR %in% c("MDR","cefR","cipR","cipNS"))
southAsia_amr_lab$value <- as.numeric(as.character(southAsia_amr_lab$value))
southAsia_amr_lab$Year <- as.numeric(as.character(southAsia_amr_lab$Year))
res_freq_annual_table$Year <- as.numeric(as.character(res_freq_annual_table$Year))
## add line for combined estimate
south_asia_amr <- res_freq_annual_table %>%
filter(Country_Origin %in% c("Bangladesh","India","Nepal","Pakistan")) %>%
filter(n>=10) %>% # plot only those years with sufficient data
as.data.frame() %>%
melt(id.vars=c("Country_Origin","Year"), variable.name="AMR") %>%
filter(AMR %in% c("MDR","cefR","cipR","cipNS")) %>%
mutate(`Isolating Lab`="Pooled") %>%
bind_rows(southAsia_amr_lab) %>%
mutate(estimate_type=ifelse(`Isolating Lab`=="Pooled",2,1)) %>%
replace_na(list(estimate_type = 1, `Isolating Lab` = "UNK")) # NA lab is unknown lab from Pakistan data in Wong 2015
# order levels
south_asia_amr$`Isolating Lab` <- factor(south_asia_amr$`Isolating Lab`)
index_combined <- which("Pooled" == levels(south_asia_amr$`Isolating Lab`))
south_asia_amr$`Isolating Lab` <- factor(south_asia_amr$`Isolating Lab`, levels=levels(south_asia_amr$`Isolating Lab`)[c(index_combined, 1:(index_combined-1), (index_combined+1):length(levels(south_asia_amr$`Isolating Lab`)))], ordered=T)
southAsia_amr_lab_plot_combinedEst <- ggplot(south_asia_amr, aes(x=Year, y=value, group=`Isolating Lab`, colour=`Isolating Lab`)) +
geom_line(size = 1, aes(group=`Isolating Lab`)) +
geom_point(size = 1) +
facet_wrap(. ~ Country_Origin + AMR) + theme_bw() +
scale_color_manual(values=c(alpha("black",1), alpha(unname(alphabet())[1:22],0.5))) +
ylab("Resistance (%)") +
theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=10, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 8, colour = "black"), legend.title = element_text(size=10, face="bold"), legend.position="bottom") +
guides(colour=guide_legend(nrow=3))
southAsia_amr_lab_plot_combinedEst
pdf(file=paste0("Fig6_annualGeno_annualAMR_by_lab_SouthAsia.pdf"),width=8,height=12)
(country_facet_lab_SAsia + ggtitle("a) Annual genotype prevalence, by lab")) / (southAsia_amr_lab_plot_combinedEst + ggtitle("b) Annual AMR prevalence, by lab"))
dev.off()
## quartz_off_screen
## 2
png(file=paste0("Fig6_annualGeno_annualAMR_by_lab_SouthAsia.png"),width=8, height=12, units="in", res=400)
(country_facet_lab_SAsia + ggtitle("a) Annual genotype prevalence, by lab")) / (southAsia_amr_lab_plot_combinedEst + ggtitle("b) Annual AMR prevalence, by lab"))
dev.off()
## quartz_off_screen
## 2
country_lab_data_SA_amrFreq <- tgc_from2010 %>%
filter(Country_Origin %in% c("Bangladesh", "India", "Nepal", "Pakistan")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CDC", "*US (CDC)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PHE", "*England (PHE)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="MDU", "*Australia (MDU)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="ESR", "*New Zealand (ESR)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CHR", "Dhaka (CHR)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="ICD", "Dhaka (ICD)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CMH", "Chittagong (ICD)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="AII", "New Delhi (AII)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CML", "Ludhiana (CML)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CNH", "New Delhi (CNH)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KHM", "Mumbai (KHM)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KKC", "Chennai (KKC)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="RDT", "Andhra Pradesh (RDT)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="SAP", "New Delhi (SAP)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CMC", "Vellore (CMC)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KIMS", "Mumbai (KIM)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PGI", "Chandigarh (PGI)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="SJB", "Bengaluru (SJB)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KUH", "Dhulikhel (KUH)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PAH", "Lalitpur (PAH)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="AKU", "Karachi (AKU)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="JHL", "Lahore (JHL)")) %>%
select(Country_Origin, Region, `Isolating Lab`, MDR, cipNS, cipR, aziR, CEP, XDR) %>%
rename(cefR=CEP) %>%
mutate(Country_lab_label = paste(Country_Origin, ":", `Isolating Lab`)) %>%
select(-Region, -`Isolating Lab`) %>%
melt(id.vars=c("Country_lab_label","Country_Origin"), variable.name="AMR") %>%
group_by(Country_lab_label, Country_Origin, AMR) %>%
summarise (n=n(),
x=sum(value),
.groups = "keep") %>%
mutate(lab_freq = x/n,
lab_freq_sd = sqrt((lab_freq*(1-lab_freq))/n),
min=lab_freq-1.96*lab_freq_sd,
max=lab_freq+1.96*lab_freq_sd)
# add pooled estimate
SA_amrFreq <- tgc_from2010 %>%
select(Country_Origin, MDR, cipNS, cipR, aziR, CEP, XDR) %>%
rename(cefR=CEP) %>%
filter(Country_Origin %in% c("Bangladesh", "India", "Nepal", "Pakistan")) %>%
melt(id.vars=c("Country_Origin"), variable.name="AMR") %>%
group_by(Country_Origin, AMR) %>%
summarise (n=n(),
x=sum(value),
.groups = "keep") %>%
mutate(lab_freq = x/n,
lab_freq_sd = sqrt((lab_freq*(1-lab_freq))/n),
min=lab_freq-1.96*lab_freq_sd,
max=lab_freq+1.96*lab_freq_sd) %>%
mutate(Country_lab_label=paste0(Country_Origin, " : (Pooled)")) %>%
bind_rows(country_lab_data_SA_amrFreq)
country_lab_data_SA_amrFreq_estimates <-
SA_amrFreq %>%
mutate(pooled=if_else(grepl("(Pooled)", Country_lab_label),1,0.5)) %>%
filter(n >=20) %>% #minimum annual count per lab
ggplot() +
geom_linerange( mapping=aes(y=Country_lab_label, xmin=min, xmax=max, colour=Country_Origin)) +
geom_point(mapping=aes(y=Country_lab_label, x=lab_freq, colour=Country_Origin, alpha=pooled), size=1) +
facet_wrap(. ~ AMR, ncol=3) + theme_bw() +
scale_color_manual(values=c("red","blue","orange","black")) +
xlab("AMR prevalence estimate (%)") +
ylab("Source laboratory") +
guides(alpha = "none") +
labs(color="Country:")+
theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=10, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 8, colour = "black"), axis.text.y = element_text(size = 8, colour = "black"), legend.title = element_text(size=10, face="bold"), legend.position="bottom")
country_lab_data_SA_amrFreq_estimates
pdf(file=paste0("SuppFig13_country_lab_data_SA_amrFreq_estimates.pdf"),width=8, height=9)
country_lab_data_SA_amrFreq_estimates
dev.off()
## quartz_off_screen
## 2
png(file=paste0("SuppFig13_country_lab_data_SA_amrFreq_estimates.png"),width=8, height=9, units="in", res=400)
country_lab_data_SA_amrFreq_estimates
dev.off()
## quartz_off_screen
## 2
# lab-specific
lab_level <- SA_amrFreq %>%
filter(!(Country_lab_label %in% c("Bangladesh : (Pooled)", "India : (Pooled)", "Nepal : (Pooled)", "Pakistan : (Pooled)")))
pooled <- SA_amrFreq %>%
filter(Country_lab_label %in% c("Bangladesh : (Pooled)", "India : (Pooled)", "Nepal : (Pooled)", "Pakistan : (Pooled)")) %>%
rename(pooled_freq=lab_freq, pooled_min=min, pooled_max=max)
labs_pooled <- left_join(lab_level, pooled, by=c("Country_Origin", "AMR"))
labs_pooled <- labs_pooled %>%
filter(n.x >=20)
nrow(labs_pooled)
## [1] 168
labs_pooled_overlap_estimate <- labs_pooled %>%
filter((lab_freq > pooled_min) & (lab_freq < pooled_max))
nrow(labs_pooled_overlap_estimate)/nrow(labs_pooled)
## [1] 0.2440476
labs_pooled_overlap_interval <- labs_pooled %>%
filter(!( (max<pooled_min) | (min>pooled_max)))
nrow(labs_pooled_overlap_interval)/nrow(labs_pooled)
## [1] 0.5833333
country_lab_data_Nigeria_amrFreq <- tgc_from2010 %>%
filter(Country_Origin == "Nigeria") %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CDC", "*US (CDC)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PHE", "*England (PHE)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="IBA", "Ibadan (IBD)")) %>%
mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="ZMC", "Abuja (ZMC)")) %>%
select(`Isolating Lab`, MDR, cipNS, cipR, aziR, CEP, XDR) %>%
rename(cefR=CEP) %>%
melt(id.vars=c("Isolating Lab"), variable.name="AMR") %>%
group_by(`Isolating Lab`, AMR) %>%
summarise (n=n(),
x=sum(value),
.groups = "keep") %>%
mutate(lab_freq = x/n,
lab_freq_sd = sqrt((lab_freq*(1-lab_freq))/n),
min=lab_freq-1.96*lab_freq_sd,
max=lab_freq+1.96*lab_freq_sd)
nigeria_amrFreq <- tgc_from2010 %>%
select(Country_Origin, MDR, cipNS, cipR, aziR, CEP, XDR) %>%
rename(cefR=CEP) %>%
filter(Country_Origin == "Nigeria") %>%
melt(id.vars=c("Country_Origin"), variable.name="AMR") %>%
group_by(Country_Origin, AMR) %>%
summarise (n=n(),
x=sum(value),
.groups = "keep") %>%
mutate(lab_freq = x/n,
lab_freq_sd = sqrt((lab_freq*(1-lab_freq))/n),
min=lab_freq-1.96*lab_freq_sd,
max=lab_freq+1.96*lab_freq_sd) %>%
mutate(Country_Origin=replace(Country_Origin, Country_Origin=="Nigeria", "Nigeria (Pooled)")) %>%
rename(`Isolating Lab`=Country_Origin) %>%
bind_rows(country_lab_data_Nigeria_amrFreq)
country_lab_data_Nigeria_amrFreq_estimates <-
nigeria_amrFreq %>%
filter(n>=10) %>%
mutate(pooled=ifelse(`Isolating Lab`=="Nigeria (Pooled)","Pooled","Individual")) %>%
ggplot() +
geom_linerange( mapping=aes(y=`Isolating Lab`, xmin=min, xmax=max, colour=pooled)) +
geom_point(mapping=aes(y=`Isolating Lab`, x=lab_freq, colour=pooled), size=1) +
facet_wrap(. ~ AMR, ncol=3) + theme_bw() +
labs(x="", y="", title="b) AMR prevalence, by lab", colour="Estimate type") +
scale_color_manual(values=c("red","black")) +
theme(strip.background = element_rect(fill ="#f0f0f5"),
strip.text=element_text(size=10, face="bold"),
axis.text.x = element_text(angle = 45, hjust = 1, size = 6, colour = "black"),
axis.text.y = element_text(size = 7, colour = "black"),
legend.title=element_text(size=8, face="bold"),
legend.text = element_text(size=8), legend.key.size=unit(.25, "cm"),
legend.position="right",
plot.title=element_text(size = 9, colour = "black", face="bold"))
country_lab_data_Nigeria_amrFreq_estimates
pdf(file=paste0("Fig7_Nigeria_labcomparisons.pdf"),width=5, height=8)
country_lab_data_Nigeria_genotypeFreq_estimates / country_lab_data_Nigeria_amrFreq_estimates / country_facet_lab_Nigeria
dev.off()
## quartz_off_screen
## 2
png(file=paste0("Fig7_Nigeria_labcomparisons.png"),width=5, height=8, units="in", res=400)
country_lab_data_Nigeria_genotypeFreq_estimates / country_lab_data_Nigeria_amrFreq_estimates / country_facet_lab_Nigeria
dev.off()
## quartz_off_screen
## 2